load libraries
INDEX******************************************************************************************************************
Figure 1. a. (left) 407, stats 335 (right) 245 b. 2785, stats 2862 c. 2953 d. 2625
Figure 2. a. 730 stats, 611 b. 1262, stats 1144 c. 2785, stats 2862 d. 2625 e. 3279 f. right 3328; left 3305
Figure S3. 460
Figure S4. a. 945 b. 1422 c. 1023
Figure S5. a. 1222 b. 1317 c. 911
Figure S6. a. 1463 b. 1526 c. 1624
Figure 3. b. 1979 c. 2231 f. 2784, stats 2862 g. 2625
Figure 4. a. 2902, stats 2883 b. left 3048; deviance 3070; right
2953
c. 2625, stats 2743
Figure 5. a. 4365 b. 2841, stats 2862 c. 2962, 3464 d. 3159 e. left 2700, right 3493 f. 3440, 3411
Figure 6. a. 3790 b. 4501 c. 4270, 4231, 3950 d. 3790, 3950, 4028 e. 4115, 4155, 4221; stats 4073, 4222
PRS****************************************************************************************************************
Input the PRS data
# Read the PRS data
prs_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/PRS_array_for_manuscript_fixed.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)
Set color palette
group_colors <- c("grey","#3C8C78")
Define your list of traits to analyze
# List of traits to analyze
traits <- c("Heart_rate", "LQTS", "PR_interval", "QTc", "Afib", "HRV", "BP", "QRS", "HCM", "LVEDV","LVEDVI","LVEF","LVESV","LVESVI","SV","SVI","GGE","Focal_Epilepsy")
traits
## [1] "Heart_rate" "LQTS" "PR_interval" "QTc"
## [5] "Afib" "HRV" "BP" "QRS"
## [9] "HCM" "LVEDV" "LVEDVI" "LVEF"
## [13] "LVESV" "LVESVI" "SV" "SVI"
## [17] "GGE" "Focal_Epilepsy"
Z normalize the scores
prs_data <- prs_data %>%
mutate(across(all_of(traits), ~ scale(.x) %>% as.numeric, .names = "Normalized_{.col}"))
define the cases and controls
group_definitions <- list(
"Control" = c(1),
"Case"= c(2,3,4,5,6)
)
Format the data
# Categorize arrest_ontology into specified groups
prs_data <- prs_data %>%
mutate(arrest_group = case_when(
arrest_ontology %in% group_definitions[["Control"]] ~ "Control",
arrest_ontology %in% group_definitions[["Case"]] ~ "Case",
TRUE ~ as.character(arrest_ontology)
))
# Filter out only the relevant groups for the plot
relevant_groups <- names(group_definitions)
filtered_data <- prs_data %>%
filter(arrest_group %in% relevant_groups)
# Generate normalized column names based on the traits list
normalized_traits <- paste("Normalized", traits, sep = "_")
# Filter out non-existing columns from the normalized_traits
existing_normalized_traits <- normalized_traits[normalized_traits %in% names(filtered_data)]
# Reshape the filtered data to long format for the normalized traits
melted_data_normalized <- reshape2::melt(filtered_data,
id.vars = c('arrest_group'),
measure.vars = existing_normalized_traits)
Split into discovery or replication
prs_data_discovery <- filtered_data %>%
filter(Set == "Discovery")
prs_data_replication <- filtered_data %>%
filter(Set == "Replication")
Make the Correlation plot
# Select only the relevant columns for correlation
data_for_correlation <- prs_data_discovery[, normalized_traits]
#order the traits
corr_traits <- c(
"Normalized_QRS",
"Normalized_BP",
"Normalized_SVI",
"Normalized_LVESV",
"Normalized_PR_interval",
"Normalized_HRV",
"Normalized_GGE",
"Normalized_LVEDV",
"Normalized_LVESVI",
"Normalized_Focal_Epilepsy",
"Normalized_HCM",
"Normalized_LQTS",
"Normalized_Afib",
"Normalized_LVEDVI",
"Normalized_SV",
"Normalized_Heart_rate",
"Normalized_QTc",
"Normalized_LVEF"
)
# Ensure the DataFrame is ordered according to trait preferences
data_for_correlation_ordered <- data_for_correlation[, corr_traits]
# Calculate correlation matrix
cor_matrix_ordered <- cor(data_for_correlation_ordered, use = "complete.obs")
#Plot correlation
corrplot(cor_matrix_ordered,
method = "color",
type = "full",
tl.col = "black",
tl.srt = 45,
cl.ratio = 0.2,
col = colorRampPalette(c("#05618F", "white", "#F0BE3C"))(200),
diag = FALSE,
addgrid.col = "black"
)
#Show the results
correlation_results <- rcorr(as.matrix(data_for_correlation_ordered), type = "pearson")
correlation_results$P
## Normalized_QRS Normalized_BP Normalized_SVI
## Normalized_QRS NA 9.933678e-01 6.853895e-01
## Normalized_BP 9.933678e-01 NA 1.671754e-06
## Normalized_SVI 6.853895e-01 1.671754e-06 NA
## Normalized_LVESV 4.734779e-05 1.440789e-01 5.176039e-08
## Normalized_PR_interval 6.161903e-03 1.510868e-01 3.070402e-01
## Normalized_HRV 5.899446e-06 3.828959e-01 7.641050e-07
## Normalized_GGE 9.659843e-02 1.270667e-01 7.128341e-03
## Normalized_LVEDV 4.954003e-02 1.918542e-01 4.352831e-03
## Normalized_LVESVI 6.277110e-04 5.726864e-01 5.992465e-01
## Normalized_Focal_Epilepsy 9.955037e-02 3.021332e-01 2.018146e-01
## Normalized_HCM 2.396194e-02 7.604039e-01 2.522892e-02
## Normalized_LQTS 5.021915e-02 2.378957e-01 3.264184e-02
## Normalized_Afib 2.837904e-01 7.458152e-01 7.488965e-04
## Normalized_LVEDVI 4.291581e-02 2.605060e-03 1.172185e-05
## Normalized_SV 7.958341e-01 5.007362e-03 1.326023e-03
## Normalized_Heart_rate 7.704693e-04 7.404989e-02 0.000000e+00
## Normalized_QTc 3.611700e-01 3.700776e-05 1.148859e-12
## Normalized_LVEF 1.031027e-01 1.295488e-01 7.635401e-06
## Normalized_LVESV Normalized_PR_interval
## Normalized_QRS 4.734779e-05 0.006161903
## Normalized_BP 1.440789e-01 0.151086783
## Normalized_SVI 5.176039e-08 0.307040155
## Normalized_LVESV NA 0.279786899
## Normalized_PR_interval 2.797869e-01 NA
## Normalized_HRV 1.727377e-03 0.183954907
## Normalized_GGE 2.297310e-01 0.926572064
## Normalized_LVEDV 0.000000e+00 0.326133747
## Normalized_LVESVI 0.000000e+00 0.977482753
## Normalized_Focal_Epilepsy 3.413192e-01 0.873919485
## Normalized_HCM 0.000000e+00 0.779477674
## Normalized_LQTS 4.128755e-02 0.130976701
## Normalized_Afib 5.958125e-04 0.092859152
## Normalized_LVEDVI 0.000000e+00 0.837999134
## Normalized_SV 6.087284e-09 0.126695164
## Normalized_Heart_rate 9.201374e-03 0.151032458
## Normalized_QTc 6.331019e-01 0.389631858
## Normalized_LVEF 0.000000e+00 0.257495908
## Normalized_HRV Normalized_GGE Normalized_LVEDV
## Normalized_QRS 5.899446e-06 9.659843e-02 4.954003e-02
## Normalized_BP 3.828959e-01 1.270667e-01 1.918542e-01
## Normalized_SVI 7.641050e-07 7.128341e-03 4.352831e-03
## Normalized_LVESV 1.727377e-03 2.297310e-01 0.000000e+00
## Normalized_PR_interval 1.839549e-01 9.265721e-01 3.261337e-01
## Normalized_HRV NA 1.417508e-05 7.232691e-01
## Normalized_GGE 1.417508e-05 NA 4.327716e-01
## Normalized_LVEDV 7.232691e-01 4.327716e-01 NA
## Normalized_LVESVI 4.718379e-01 3.926784e-01 0.000000e+00
## Normalized_Focal_Epilepsy 2.515402e-01 9.696364e-01 1.792096e-01
## Normalized_HCM 2.268539e-01 7.662938e-02 9.463941e-11
## Normalized_LQTS 9.602938e-01 6.911754e-01 5.693882e-05
## Normalized_Afib 6.384594e-01 5.193028e-02 4.874951e-01
## Normalized_LVEDVI 3.163305e-01 7.224922e-02 0.000000e+00
## Normalized_SV 5.701852e-01 3.181255e-01 0.000000e+00
## Normalized_Heart_rate 1.551204e-12 1.646706e-01 1.215043e-03
## Normalized_QTc 6.050812e-03 9.795078e-03 9.106381e-03
## Normalized_LVEF 1.320877e-02 3.297949e-01 1.024596e-07
## Normalized_LVESVI Normalized_Focal_Epilepsy
## Normalized_QRS 0.000627711 0.09955037
## Normalized_BP 0.572686369 0.30213319
## Normalized_SVI 0.599246487 0.20181462
## Normalized_LVESV 0.000000000 0.34131916
## Normalized_PR_interval 0.977482753 0.87391948
## Normalized_HRV 0.471837901 0.25154018
## Normalized_GGE 0.392678433 0.96963636
## Normalized_LVEDV 0.000000000 0.17920961
## Normalized_LVESVI NA 0.30285180
## Normalized_Focal_Epilepsy 0.302851796 NA
## Normalized_HCM 0.000000000 0.95432712
## Normalized_LQTS 0.022630206 0.59571578
## Normalized_Afib 0.617610997 0.90444909
## Normalized_LVEDVI 0.000000000 0.70452838
## Normalized_SV 0.016279998 0.71995021
## Normalized_Heart_rate 0.285679168 0.19226455
## Normalized_QTc 0.071460435 0.97448035
## Normalized_LVEF 0.000000000 0.87505014
## Normalized_HCM Normalized_LQTS Normalized_Afib
## Normalized_QRS 2.396194e-02 5.021915e-02 0.2837904158
## Normalized_BP 7.604039e-01 2.378957e-01 0.7458151593
## Normalized_SVI 2.522892e-02 3.264184e-02 0.0007488965
## Normalized_LVESV 0.000000e+00 4.128755e-02 0.0005958125
## Normalized_PR_interval 7.794777e-01 1.309767e-01 0.0928591521
## Normalized_HRV 2.268539e-01 9.602938e-01 0.6384594227
## Normalized_GGE 7.662938e-02 6.911754e-01 0.0519302805
## Normalized_LVEDV 9.463941e-11 5.693882e-05 0.4874950835
## Normalized_LVESVI 0.000000e+00 2.263021e-02 0.6176109965
## Normalized_Focal_Epilepsy 9.543271e-01 5.957158e-01 0.9044490887
## Normalized_HCM NA 2.747912e-01 0.2051656313
## Normalized_LQTS 2.747912e-01 NA 0.6090492099
## Normalized_Afib 2.051656e-01 6.090492e-01 NA
## Normalized_LVEDVI 3.149592e-06 5.104404e-06 0.8024317643
## Normalized_SV 5.644828e-01 2.725126e-06 0.7848328230
## Normalized_Heart_rate 7.267055e-01 2.228441e-01 0.0174432137
## Normalized_QTc 9.979204e-01 0.000000e+00 0.1904777957
## Normalized_LVEF 0.000000e+00 5.014860e-01 0.0219999301
## Normalized_LVEDVI Normalized_SV Normalized_Heart_rate
## Normalized_QRS 4.291581e-02 7.958341e-01 7.704693e-04
## Normalized_BP 2.605060e-03 5.007362e-03 7.404989e-02
## Normalized_SVI 1.172185e-05 1.326023e-03 0.000000e+00
## Normalized_LVESV 0.000000e+00 6.087284e-09 9.201374e-03
## Normalized_PR_interval 8.379991e-01 1.266952e-01 1.510325e-01
## Normalized_HRV 3.163305e-01 5.701852e-01 1.551204e-12
## Normalized_GGE 7.224922e-02 3.181255e-01 1.646706e-01
## Normalized_LVEDV 0.000000e+00 0.000000e+00 1.215043e-03
## Normalized_LVESVI 0.000000e+00 1.628000e-02 2.856792e-01
## Normalized_Focal_Epilepsy 7.045284e-01 7.199502e-01 1.922646e-01
## Normalized_HCM 3.149592e-06 5.644828e-01 7.267055e-01
## Normalized_LQTS 5.104404e-06 2.725126e-06 2.228441e-01
## Normalized_Afib 8.024318e-01 7.848328e-01 1.744321e-02
## Normalized_LVEDVI NA 0.000000e+00 2.187902e-01
## Normalized_SV 0.000000e+00 NA 1.209219e-02
## Normalized_Heart_rate 2.187902e-01 1.209219e-02 NA
## Normalized_QTc 1.142112e-03 4.917063e-06 2.184145e-02
## Normalized_LVEF 3.606922e-03 1.979529e-02 1.680620e-02
## Normalized_QTc Normalized_LVEF
## Normalized_QRS 3.611700e-01 1.031027e-01
## Normalized_BP 3.700776e-05 1.295488e-01
## Normalized_SVI 1.148859e-12 7.635401e-06
## Normalized_LVESV 6.331019e-01 0.000000e+00
## Normalized_PR_interval 3.896319e-01 2.574959e-01
## Normalized_HRV 6.050812e-03 1.320877e-02
## Normalized_GGE 9.795078e-03 3.297949e-01
## Normalized_LVEDV 9.106381e-03 1.024596e-07
## Normalized_LVESVI 7.146043e-02 0.000000e+00
## Normalized_Focal_Epilepsy 9.744803e-01 8.750501e-01
## Normalized_HCM 9.979204e-01 0.000000e+00
## Normalized_LQTS 0.000000e+00 5.014860e-01
## Normalized_Afib 1.904778e-01 2.199993e-02
## Normalized_LVEDVI 1.142112e-03 3.606922e-03
## Normalized_SV 4.917063e-06 1.979529e-02
## Normalized_Heart_rate 2.184145e-02 1.680620e-02
## Normalized_QTc NA 8.490931e-02
## Normalized_LVEF 8.490931e-02 NA
Test for ANOVA assumptions
check_anova_assumptions <- function(data, trait) {
# Ensure 'arrest_group' is a factor
data$arrest_group <- as.factor(data$arrest_group)
# Fit the ANOVA model
formula <- as.formula(paste(trait, "~ arrest_group"))
anova_model <- aov(formula, data = data)
# Extract residuals
residuals <- residuals(anova_model)
# Shapiro-Wilk test for normality
shapiro_test <- shapiro.test(residuals)
shapiro_p_value <- shapiro_test$p.value
# Levene's Test for homogeneity of variances
levene_test <- leveneTest(formula, data = data)
levene_p_value <- levene_test$`Pr(>F)`[1]
# Bartlett's Test for homogeneity of variances
bartlett_test <- bartlett.test(formula, data = data)
bartlett_p_value <- bartlett_test$p.value
# Create a summary table with the test results
data.frame(
Trait = trait,
Shapiro_Wilk_p_value = shapiro_p_value,
Levene_p_value = levene_p_value,
Bartlett_p_value = bartlett_p_value
)
}
anova_assumptions_results <- lapply(normalized_traits, function(trait) check_anova_assumptions(prs_data_discovery, trait))
# Combine the results into a single data frame
anova_assumptions_df <- do.call(rbind, anova_assumptions_results)
# Print the results
print(anova_assumptions_df)
## Trait Shapiro_Wilk_p_value Levene_p_value
## 1 Normalized_Heart_rate 3.670786e-05 3.402430e-01
## 2 Normalized_LQTS 8.582805e-01 8.524520e-01
## 3 Normalized_PR_interval 9.626637e-02 7.067574e-01
## 4 Normalized_QTc 1.378815e-07 4.084220e-01
## 5 Normalized_Afib 5.252293e-01 4.766202e-01
## 6 Normalized_HRV 2.678043e-13 1.221016e-01
## 7 Normalized_BP 2.145452e-02 8.637621e-01
## 8 Normalized_QRS 1.508077e-39 2.042997e-06
## 9 Normalized_HCM 3.144379e-03 9.212937e-01
## 10 Normalized_LVEDV 1.331310e-01 2.218475e-04
## 11 Normalized_LVEDVI 5.919657e-02 7.738241e-03
## 12 Normalized_LVEF 2.471126e-02 3.111787e-01
## 13 Normalized_LVESV 1.169793e-02 9.853473e-01
## 14 Normalized_LVESVI 7.239432e-04 5.086718e-01
## 15 Normalized_SV 1.118507e-05 5.214914e-01
## 16 Normalized_SVI 8.610508e-04 8.519458e-01
## 17 Normalized_GGE 4.448074e-02 6.815913e-01
## 18 Normalized_Focal_Epilepsy 3.025018e-03 6.100802e-01
## Bartlett_p_value
## 1 6.287623e-02
## 2 7.465532e-01
## 3 8.498751e-01
## 4 5.892923e-01
## 5 5.905392e-01
## 6 6.549201e-02
## 7 8.042278e-01
## 8 1.040405e-24
## 9 8.033151e-01
## 10 9.995313e-04
## 11 1.890503e-02
## 12 4.997414e-01
## 13 8.378271e-01
## 14 2.967646e-01
## 15 6.599025e-01
## 16 6.489996e-01
## 17 5.545373e-01
## 18 4.610240e-01
Since some do not meet ANOVA criteria, we go with nonparametric
# Define Wilcoxon Rank Sum test function with median difference calculation
perform_wilcoxon_test <- function(data, trait) {
# Convert data to appropriate format
data <- data %>%
dplyr::select(arrest_group, all_of(trait)) %>%
drop_na()
# Calculate medians for each group
group_medians <- data %>%
group_by(arrest_group) %>%
summarise(median_value = median(!!sym(trait)))
# Calculate the difference in medians
diff_median <- diff(group_medians$median_value)
# Perform Wilcoxon rank-sum test
wilcoxon_test_result <- wilcox_test(data, as.formula(paste(trait, "~ arrest_group")))
# Extract p-value
p_value <- wilcoxon_test_result$p
# Adjust p-value using Bonferroni correction
p_adjusted <- p.adjust(p_value, method = "bonferroni", n = 18)
# Create a data frame with the test results
result_df <- data.frame(
Trait = trait,
p_value = p_value,
p_adjusted = p_adjusted,
significant = p_adjusted < 0.05,
diff_median = diff_median
)
return(result_df)
}
# Perform Wilcoxon test for each trait and store results
wilcoxon_results <- lapply(normalized_traits, function(trait) perform_wilcoxon_test(prs_data_discovery, trait))
# Combine all Wilcoxon results into a single data frame
combined_wilcoxon_results <- do.call(rbind, wilcoxon_results)
# Print combined Wilcoxon results with p-values and median differences
print("Wilcoxon Rank Sum Test Results with Manual P-value Adjustments and Median Differences:")
## [1] "Wilcoxon Rank Sum Test Results with Manual P-value Adjustments and Median Differences:"
print(combined_wilcoxon_results)
## Trait p_value p_adjusted significant diff_median
## 1 Normalized_Heart_rate 1.24e-02 2.2320e-01 FALSE 0.088165794
## 2 Normalized_LQTS 1.91e-01 1.0000e+00 FALSE 0.112902629
## 3 Normalized_PR_interval 2.34e-02 4.2120e-01 FALSE -0.160790733
## 4 Normalized_QTc 4.67e-03 8.4060e-02 FALSE 0.179095083
## 5 Normalized_Afib 1.07e-01 1.0000e+00 FALSE 0.080017842
## 6 Normalized_HRV 1.51e-01 1.0000e+00 FALSE -0.089874654
## 7 Normalized_BP 1.21e-06 2.1780e-05 TRUE -0.275654610
## 8 Normalized_QRS 1.12e-08 2.0160e-07 TRUE -0.128954882
## 9 Normalized_HCM 4.76e-01 1.0000e+00 FALSE 0.031802432
## 10 Normalized_LVEDV 6.38e-01 1.0000e+00 FALSE -0.044351499
## 11 Normalized_LVEDVI 7.37e-02 1.0000e+00 FALSE 0.118958269
## 12 Normalized_LVEF 1.76e-03 3.1680e-02 TRUE 0.124406676
## 13 Normalized_LVESV 7.02e-03 1.2636e-01 FALSE -0.124673369
## 14 Normalized_LVESVI 9.49e-01 1.0000e+00 FALSE -0.011000752
## 15 Normalized_SV 3.55e-02 6.3900e-01 FALSE 0.146894687
## 16 Normalized_SVI 1.77e-04 3.1860e-03 TRUE -0.360642163
## 17 Normalized_GGE 3.20e-01 1.0000e+00 FALSE -0.061833861
## 18 Normalized_Focal_Epilepsy 9.01e-01 1.0000e+00 FALSE 0.001265513
Make the lolliplot
# Define the groups for metrics and syndromes
metrics <- c("Normalized_Heart_rate", "Normalized_PR_interval", "Normalized_QTc", "Normalized_HRV", "Normalized_BP", "Normalized_QRS",
"Normalized_LVEDV", "Normalized_LVEDVI", "Normalized_LVEF", "Normalized_LVESV", "Normalized_LVESVI", "Normalized_SV", "Normalized_SVI")
syndromes <- c("Normalized_LQTS", "Normalized_Afib", "Normalized_HCM", "Normalized_GGE", "Normalized_Focal_Epilepsy")
# Categorize each trait as 'Metrics' or 'Syndromes'
combined_wilcoxon_results$Category <- ifelse(combined_wilcoxon_results$Trait %in% metrics, "Metrics",
ifelse(combined_wilcoxon_results$Trait %in% syndromes, "Syndromes", "Combined"))
# Reorder the levels of Trait based on the Category
combined_wilcoxon_results <- combined_wilcoxon_results %>%
mutate(Trait = factor(Trait, levels = unique(Trait[order(Category)])))
# Transform P-values to -log10 scale
combined_wilcoxon_results$NegLogP <- -log10(combined_wilcoxon_results$p_value)
# Calculate the -log10(0.05) for the reference line
threshold <- -log10(0.05)
threshold2 <- -log10(0.05/18)
# Create a new variable for coloring based on P-value significance (already computed in the original df)
combined_wilcoxon_results$Significant <- ifelse(combined_wilcoxon_results$p_value < 0.05, "Significant", "Not Significant")
# Combine metrics and syndromes into a single ordered list
ordered_traits <- c(
"Normalized_QRS",
"Normalized_BP",
"Normalized_SVI",
"Normalized_LVESV",
"Normalized_PR_interval",
"Normalized_HRV",
"Normalized_GGE",
"Normalized_LVEDV",
"Normalized_LVESVI",
"Normalized_Focal_Epilepsy",
"Normalized_HCM",
"Normalized_LQTS",
"Normalized_Afib",
"Normalized_LVEDVI",
"Normalized_SV",
"Normalized_Heart_rate",
"Normalized_QTc",
"Normalized_LVEF"
)
# Update the 'Trait' column to have this specific order
combined_wilcoxon_results$Trait <- factor(combined_wilcoxon_results$Trait, levels = ordered_traits)
# Create the plot, coloring by diff_median
p3 <- ggplot(combined_wilcoxon_results, aes(x = Trait, y = NegLogP, color = diff_median)) +
geom_segment(aes(xend = Trait, yend = 0), size = 1) +
geom_point(size = 3) +
geom_hline(yintercept = threshold, linetype = "dotted", color = "Black") +
geom_hline(yintercept = threshold2, linetype = "dotted", color = "Red") +
scale_color_gradient2(low = "#3C8C78", mid = "white", high = "black", midpoint = 0) +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
strip.text.x = element_text(size = 10)) +
labs(y = "-log10(P-value)", x = "Trait", color = "Median Difference", title = "Lollipop Plot of -log10(P-values) by Trait")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the plot
print(p3)
# Function to generate plots for a given trait
generate_plots_for_trait <- function(data, trait, group_colors) {
# Rank the scores for the specified trait
data$rank <- rank(data[[trait]], na.last = "keep")
# Density plot for the trait
density_plot <- ggplot(data, aes(x = rank, fill = arrest_group, y = after_stat(density))) +
geom_density(alpha = 0.6) +
scale_fill_manual(values = group_colors) +
labs(title = paste("Relative Density of Overall", trait, "Trait Ranks Across Arrest Groups"),
x = paste("Overall Rank of", trait, "Scores"),
y = "Relative Density") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# CDF plot for the trait
cdf_plot <- ggplot(data, aes(x = rank, color = arrest_group)) +
stat_ecdf(geom = "step", size = 1) +
scale_color_manual(values = group_colors) +
labs(title = paste("CDF of Overall", trait, "Trait Ranks Across Arrest Groups"),
x = paste("Overall Rank of", trait, "Scores"),
y = "Cumulative Proportion") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Combine the plots
combined_plot <- plot_grid(density_plot, cdf_plot, ncol = 1, align = 'v', labels = c("A", "B"))
# Return the combined plot
return(combined_plot)
}
# List of traits to plot
traits <- c("BP", "Afib", "QRS","LVEF","Heart_rate")
# Initialize an empty list to store plots
plots_list <- list()
# Loop through each trait and generate plots, storing them in the list
for(trait in traits) {
plots_list[[trait]] <- generate_plots_for_trait(prs_data_discovery, trait, group_colors)
}
######## Access each plot by its trait name from the plots_list ##########
# For example, to print the plot for QRS, use:
print(plots_list[["QRS"]])
print(plots_list[["BP"]])
Coding regions******************************************************************************************************************
Bring in data
coding_stats <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/statistics_by_individual_replication_for_manuscript_fixed_NEW.txt", header = TRUE, sep = "\t")
coding_stats_discovery <- coding_stats[coding_stats$Replication == 0, ]
Categorize the data
categorized_coding_stats_discovery <- coding_stats_discovery %>%
mutate(Category_Group = case_when(
Category == 1 ~ "Control",
Category %in% 2:6 ~ "Case"
))
Merge some of the column data to get the desired allele frequency intervals
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
mutate(
Epilepsy_01_00001 = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
Epilepsy_00001 = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
Epilepsy_total_missense = Epilepsy_missense_common + Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001 +
Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
Cardiomyopathy_01_00001 = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
Cardiomyopathy_00001 = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
Cardiomyopathy_total_missense = Cardiomyopathy_missense_common + Cardiomyopathy_missense_variant_0.01 +
Cardiomyopathy_missense_variant_0.001 + Cardiomyopathy_missense_variant_0.00001 +
Cardiomyopathy_missense_singleton
)
# Clean up the data: remove 'NA's, 'Inf's, and select only needed columns
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
dplyr::select(-ID, -Category) %>%
na.omit() %>%
dplyr::filter(if_all(everything(), ~ !is.infinite(.)))
# Aggregate the data to compute mean and standard deviation
collapsed_coding_stats <- categorized_coding_stats_discovery %>%
group_by(Category_Group) %>%
summarise(across(where(is.numeric), list(mean = ~mean(., na.rm = TRUE),
std = ~sd(., na.rm = TRUE))))
# Clean up the collapsed data
collapsed_coding_stats <- na.omit(collapsed_coding_stats)
Perform permutation simulation and test
# List of coding variables to analyze
coding_variables_to_analyze <- c("Cardiomyopathy_HIGH", "Cardiomyopathy_start_lost",
"Cardiomyopathy_stop_gained", "Cardiomyopathy_01_00001",
"Cardiomyopathy_00001", "Cardiomyopathy_total_missense", "PLP_Cardiomyopathy",
"Epilepsy_HIGH", "Epilepsy_start_lost",
"Epilepsy_stop_gained", "Epilepsy_01_00001",
"Epilepsy_00001", "Epilepsy_total_missense", "PLP_Epilepsy")
# Define the permutation function
permutation_test <- function(data, var, group_var, num_permutations = 10000) {
# Check for sufficient group sizes
if (sum(data[[group_var]] == "Case", na.rm = TRUE) > 1 &&
sum(data[[group_var]] == "Control", na.rm = TRUE) > 1) {
# Calculate the observed difference in means between the two groups
observed_stat <- abs(mean(data[[var]][data[[group_var]] == "Case"], na.rm = TRUE) -
mean(data[[var]][data[[group_var]] == "Control"], na.rm = TRUE))
# Create an empty vector to store the permutation statistics
perm_stats <- numeric(num_permutations)
# Perform the permutations
for (i in 1:num_permutations) {
# Randomly shuffle the group labels
permuted_group <- sample(data[[group_var]])
# Calculate the difference in means for the permuted data
permuted_stat <- abs(mean(data[[var]][permuted_group == "Case"], na.rm = TRUE) -
mean(data[[var]][permuted_group == "Control"], na.rm = TRUE))
# Store the permuted statistic
perm_stats[i] <- permuted_stat
}
# Calculate the p-value (proportion of permuted stats >= observed stat)
p_value <- mean(perm_stats >= observed_stat)
return(list(observed_stat = observed_stat, perm_stats = perm_stats, p_value = p_value))
} else {
warning(paste("Skipping variable", var, "due to insufficient group size"))
return(NULL)
}
}
# Initialize a dataframe to store p-values for each variable
p_value_results <- data.frame(Variable = character(), Observed_Stat = numeric(), p_value = numeric(), stringsAsFactors = FALSE)
# Loop through each variable and perform the permutation test
for (var in coding_variables_to_analyze) {
# Run the permutation test
test_result <- permutation_test(categorized_coding_stats_discovery, var, "Category_Group")
if (!is.null(test_result)) {
# Extract the permuted statistics, observed statistic, and p-value
perm_stats <- test_result$perm_stats
observed_stat <- test_result$observed_stat
p_value <- test_result$p_value
# Store the p-value and observed statistic in the dataframe
p_value_results <- rbind(p_value_results, data.frame(Variable = var,
Observed_Stat = observed_stat,
p_value = p_value))
# Create a data frame for plotting the permutation distribution
perm_df <- data.frame(perm_stats = perm_stats)
# Plot the permutation distribution with the observed statistic marked and p-value
p <- ggplot(perm_df, aes(x = perm_stats)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
geom_vline(aes(xintercept = observed_stat), color = "red", linetype = "dashed", size = 1) +
labs(title = paste("Permutation Test for", var),
x = "Permuted Statistic",
y = "Frequency",
subtitle = paste("Observed Statistic =", round(observed_stat, 4),
"| P-value =", format(p_value, digits = 4))) + # Add p-value to subtitle
theme_minimal()
# Print the plot
print(p)
}
}
# Print the table of p-values
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
## Variable Observed_Stat p_value
## 1 Cardiomyopathy_HIGH 0.67487504 0.0004
## 2 Cardiomyopathy_start_lost 0.03608007 0.0001
## 3 Cardiomyopathy_stop_gained 0.27295893 0.0000
## 4 Cardiomyopathy_01_00001 2.19304371 0.0000
## 5 Cardiomyopathy_00001 0.55399147 0.0015
## 6 Cardiomyopathy_total_missense 8.02403705 0.0000
## 7 PLP_Cardiomyopathy 0.19359502 0.0000
## 8 Epilepsy_HIGH 0.70615260 0.0091
## 9 Epilepsy_start_lost 0.01129570 0.0507
## 10 Epilepsy_stop_gained 0.34350191 0.0000
## 11 Epilepsy_01_00001 1.45746349 0.0000
## 12 Epilepsy_00001 0.56937910 0.0037
## 13 Epilepsy_total_missense 2.48950064 0.0002
## 14 PLP_Epilepsy 0.01211653 0.5191
Create the Function to plot each column as a ratio to the means from Group 1
plot_column_ratio <- function(data, col_name) {
# Calculate the mean and standard deviation for Control
group1_mean <- mean(data[data$Category_Group == "Control", col_name], na.rm = TRUE)
group1_sd <- sd(data[data$Category_Group == "Control", col_name], na.rm = TRUE)
# Calculate the ratio for each group relative to Group 1
data_summary <- data %>%
group_by(Category_Group) %>%
summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
sd_value = sd(.data[[col_name]], na.rm = TRUE),
n = n()) %>%
mutate(ratio = mean_value / group1_mean,
sem_value = sd_value / sqrt(n),
sem_ratio = sem_value / group1_mean, # SEM scaled to the group 1, just like the mean value
ci_lower = ratio - (1.96 * sem_ratio), # Lower bound of the CI
ci_upper = ratio + (1.96 * sem_ratio)) # Upper bound of the CI
return(list(summary_data = data_summary))
}
Plot the data relative to group 1
# Initialize an empty dataframe to store summary data
combined_coding_stats_summary_df <- data.frame(Category_Group = character(), col_name = character(), mean = numeric(), std = numeric(), stringsAsFactors = FALSE)
# List of all columns to plot
columns_to_plot <- setdiff(names(categorized_coding_stats_discovery), c("ID", "Category", "Category_Group"))
# Loop through each column and plot relative to Category_Group1 (Crontrol)
for (col in columns_to_plot) {
plot_data <- plot_column_ratio(categorized_coding_stats_discovery, col)
# Append summary data to combined_coding_stats_summary_df
combined_coding_stats_summary_df <- bind_rows(combined_coding_stats_summary_df, mutate(plot_data$summary_data, col_name = col))
}
# Subset the data for the specified variables
selected_coding_data <- combined_coding_stats_summary_df %>%
filter(col_name %in% c(
"Cardiomyopathy_total_missense",
"Cardiomyopathy_01_00001",
"Cardiomyopathy_00001",
"Epilepsy_total_missense",
"Epilepsy_01_00001",
"Epilepsy_00001",
"PLP_Cardiomyopathy",
"PLP_Epilepsy",
"Cardiomyopathy_start_lost",
"Cardiomyopathy_stop_gained",
"Cardiomyopathy_HIGH",
"Epilepsy_start_lost",
"Epilepsy_HIGH",
"Epilepsy_stop_gained"
),
Category_Group != "Control")
# Define the specific order for "Epilepsy" and CMAR variants
levels_order <- c(
"PLP_Epilepsy",
"Epilepsy_00001",
"Epilepsy_01_00001",
"Epilepsy_total_missense",
"Epilepsy_start_lost",
"Epilepsy_stop_gained",
"Epilepsy_HIGH",
"PLP_Cardiomyopathy",
"Cardiomyopathy_00001",
"Cardiomyopathy_01_00001",
"Cardiomyopathy_total_missense",
"Cardiomyopathy_start_lost",
"Cardiomyopathy_stop_gained",
"Cardiomyopathy_HIGH"
)
selected_coding_data$col_name <- factor(selected_coding_data$col_name, levels = levels_order)
# Plot
coding_stats_plot <- ggplot(selected_coding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
geom_point(position = position_dodge(width = 1), size = 3) +
geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = position_dodge(width = 0.5), width = 1) +
geom_vline(xintercept = 1, linetype = "dashed") +
scale_color_manual(values = "#3C8C78") +
labs(title = "Ratio Compared to Control +/-SEM",
y = "Variant",
x = "Ratio to Control Mean",
color = "Category Group") +
theme_minimal() +
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8, hjust = 1),
axis.title.x = element_text(size = 8),
axis.title.y = element_text(size = 8),
legend.title = element_text(size = 8),
legend.text = element_text(size = 8),
plot.title = element_text(size = 8, hjust = 0.5),
plot.subtitle = element_text(size = 8),
plot.caption = element_text(size = 8),
plot.margin = margin(15, 15, 15, 15)) +
scale_x_continuous(limits = c(-2, 12))
print(coding_stats_plot)
# Print the table of p-values
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
## Variable Observed_Stat p_value
## 1 Cardiomyopathy_HIGH 0.67487504 0.0004
## 2 Cardiomyopathy_start_lost 0.03608007 0.0001
## 3 Cardiomyopathy_stop_gained 0.27295893 0.0000
## 4 Cardiomyopathy_01_00001 2.19304371 0.0000
## 5 Cardiomyopathy_00001 0.55399147 0.0015
## 6 Cardiomyopathy_total_missense 8.02403705 0.0000
## 7 PLP_Cardiomyopathy 0.19359502 0.0000
## 8 Epilepsy_HIGH 0.70615260 0.0091
## 9 Epilepsy_start_lost 0.01129570 0.0507
## 10 Epilepsy_stop_gained 0.34350191 0.0000
## 11 Epilepsy_01_00001 1.45746349 0.0000
## 12 Epilepsy_00001 0.56937910 0.0037
## 13 Epilepsy_total_missense 2.48950064 0.0002
## 14 PLP_Epilepsy 0.01211653 0.5191
Input the data
# Pull in the gene data
gene_data <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_variants_by_gene_for_manuscript_fixed_NEW.csv")
# Read the cohort data
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort_for_manuscript_fixed.txt", sep = "\t", header = TRUE)
# add the arrest category to each
gene_data$Category <- cohort$arrest_ontology[match(gene_data$ID, cohort$CGM_id)]
gene_data_discovery <- gene_data %>%
filter(Set == "Discovery")
Summarize the data by GENE, counting the number of variants for each gene Filter for Category > 1 and then group and summarize
variants_per_gene_Cat_2_6 <- gene_data_discovery %>%
filter(Category > 1) %>%
group_by(GENE) %>%
summarise(
HIGH = sum(HIGH, na.rm = TRUE),
PLP = sum(PLP, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
print(variants_per_gene_Cat_2_6)
## # A tibble: 356 × 3
## GENE HIGH PLP
## <chr> <int> <int>
## 1 A2ML1 57 0
## 2 AARS 6 0
## 3 ABAT 0 0
## 4 ABCC9 4 0
## 5 ACADVL 4 0
## 6 ACTC1 1 0
## 7 ACTL6B 0 0
## 8 ACTN2 7 0
## 9 ADAM22 3 0
## 10 ADSL 1 1
## # ℹ 346 more rows
Filter for Category == 1 and then group and summarize
variants_per_gene_Cat_1 <- gene_data_discovery %>%
filter(Category == 1) %>%
group_by(GENE) %>%
summarise(
HIGH = sum(HIGH, na.rm = TRUE),
PLP = sum(PLP, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
print(variants_per_gene_Cat_1)
## # A tibble: 341 × 3
## GENE HIGH PLP
## <chr> <int> <int>
## 1 A2ML1 69 0
## 2 AARS 0 0
## 3 ABAT 0 0
## 4 ABCC9 3 0
## 5 ACADVL 6 3
## 6 ACTL6B 0 2
## 7 ACTN2 1 0
## 8 ADAM22 0 0
## 9 ADSL 0 0
## 10 AGL 0 0
## # ℹ 331 more rows
Load gene lists from text files
genes_CMAR1 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR1_list.txt")
genes_CMAR2 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR2_list.txt")
genes_EIEE_OMIM <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_EIEE_OMIM_list.txt")
genes_Epilepsy <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_Epilepsy_list.txt")
Annotate gene with source panel
genes_CMAR1 <- data.frame(Gene = genes_CMAR1, Source = "Cardiomyopathy")
genes_CMAR2 <- data.frame(Gene = genes_CMAR2, Source = "Cardiomyopathy")
genes_EIEE_OMIM <- data.frame(Gene = genes_EIEE_OMIM, Source = "Epilepsy")
genes_Epilepsy <- data.frame(Gene = genes_Epilepsy, Source = "Epilepsy")
# Replace "\"AARS1\"" with "AARS"
genes_EIEE_OMIM$Gene <- ifelse(genes_EIEE_OMIM$Gene == "\"AARS1\"", "AARS", genes_EIEE_OMIM$Gene)
# Replace "\"KIAA2022\"" with "NEXMIF"
genes_Epilepsy$Gene <- ifelse(genes_Epilepsy$Gene == "\"NEXMIF\"", "KIAA2022", genes_Epilepsy$Gene)
Append panel source to the gene_data
# Combine all lists into one dataframe
all_genes <- rbind(genes_CMAR1, genes_CMAR2, genes_EIEE_OMIM, genes_Epilepsy)
# Sort by Gene and Source to ensure "Cardiomyopathy" comes first
all_genes <- all_genes[order(all_genes$Gene, all_genes$Source), ]
# Remove duplicates, keeping the first occurrence
all_genes <- all_genes[!duplicated(all_genes$Gene), ]
# Clean the 'Gene' column in 'all_genes' by removing quotes and backslashes
all_genes$Gene <- gsub('\"', '', all_genes$Gene)
# Ensure that the 'GENE' column in 'gene_data_discovery' and 'Gene' in 'all_genes' are character
gene_data_discovery$GENE <- as.character(gene_data_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)
# find the index of each gene in 'all_genes' and use this index to assign the corresponding 'Source' to a new column in 'gene_data'
gene_data_discovery$Source <- all_genes$Source[match(gene_data_discovery$GENE, all_genes$Gene)]
# Now, 'gene_data' will have a new column 'Source' with the source of each gene
# based on the lookup from 'all_genes'
# View the first few rows to verify the new 'Source' has been added
head(gene_data_discovery)
## ID GENE HIGH PLP Set Category Source
## 1 CGM0000001 A2ML1 0 0 Discovery 1 Cardiomyopathy
## 2 CGM0000001 ABAT 0 0 Discovery 1 Epilepsy
## 3 CGM0000001 ADAM22 0 0 Discovery 1 Epilepsy
## 4 CGM0000001 AKAP9 0 0 Discovery 1 Cardiomyopathy
## 5 CGM0000001 ALDH5A1 0 0 Discovery 1 Epilepsy
## 6 CGM0000001 ALMS1 0 0 Discovery 1 Cardiomyopathy
Add the source to the category 1 variants list
variants_per_gene_Cat_1$Source <- all_genes$Source[match(variants_per_gene_Cat_1$GENE, all_genes$Gene)]
head(variants_per_gene_Cat_1)
## # A tibble: 6 × 4
## GENE HIGH PLP Source
## <chr> <int> <int> <chr>
## 1 A2ML1 69 0 Cardiomyopathy
## 2 AARS 0 0 Epilepsy
## 3 ABAT 0 0 Epilepsy
## 4 ABCC9 3 0 Cardiomyopathy
## 5 ACADVL 6 3 Cardiomyopathy
## 6 ACTL6B 0 2 Epilepsy
Add the source to the category 2-6 variants list
variants_per_gene_Cat_2_6$Source <- all_genes$Source[match(variants_per_gene_Cat_2_6$GENE, all_genes$Gene)]
head(variants_per_gene_Cat_2_6)
## # A tibble: 6 × 4
## GENE HIGH PLP Source
## <chr> <int> <int> <chr>
## 1 A2ML1 57 0 Cardiomyopathy
## 2 AARS 6 0 Epilepsy
## 3 ABAT 0 0 Epilepsy
## 4 ABCC9 4 0 Cardiomyopathy
## 5 ACADVL 4 0 Cardiomyopathy
## 6 ACTC1 1 0 Cardiomyopathy
combine the variants together now
# Add a new column to indicate the category
variants_per_gene_Cat_1 <- variants_per_gene_Cat_1 %>%
mutate(Category = "1")
# Add a new column to indicate the category range
variants_per_gene_Cat_2_6 <- variants_per_gene_Cat_2_6 %>%
mutate(Category = "2-6")
# Combine the two datasets
combined_variants <- bind_rows(variants_per_gene_Cat_1, variants_per_gene_Cat_2_6)
# Print the combined dataset
print(combined_variants)
## # A tibble: 697 × 5
## GENE HIGH PLP Source Category
## <chr> <int> <int> <chr> <chr>
## 1 A2ML1 69 0 Cardiomyopathy 1
## 2 AARS 0 0 Epilepsy 1
## 3 ABAT 0 0 Epilepsy 1
## 4 ABCC9 3 0 Cardiomyopathy 1
## 5 ACADVL 6 3 Cardiomyopathy 1
## 6 ACTL6B 0 2 Epilepsy 1
## 7 ACTN2 1 0 Cardiomyopathy 1
## 8 ADAM22 0 0 Epilepsy 1
## 9 ADSL 0 0 Epilepsy 1
## 10 AGL 0 0 Cardiomyopathy 1
## # ℹ 687 more rows
Plot the number of variant types in cases and controls
# Filter dataset where High > 0
combined_variants_High <- combined_variants %>% filter(HIGH > 0)
# Create a label function
custom_labeller <- function(variable, value) {
if (variable == "Category") {
return(ifelse(value == "1", "Control", "Case"))
}
return(value)
}
# Create the plot with custom labeller
High_variant_plot <- ggplot(combined_variants_High, aes(x = GENE, y = Category, fill = HIGH)) +
geom_tile() +
scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"),
values = scales::rescale(c(0, 0.5, 1))) +
facet_wrap(~ Source, scales = "free_x", ncol = 1) +
labs(title = "High Variants Heatmap",
x = "Gene",
y = "Category",
fill = "Count") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
strip.background = element_blank(),
strip.placement = "outside") +
scale_y_discrete(labels = function(x) custom_labeller("Category", x))
# Print the plot
print(High_variant_plot)
Plot the number of variant types in cases and controls
# Create a custom labeller function for PLP_plot
custom_labeller_plp <- function(variable, value) {
if (variable == "Category") {
return(ifelse(value == "1", "Control", "Case"))
}
return(value)
}
# Filter datasets where PLP > 0
combined_variants_PLP <- combined_variants %>% filter(PLP > 0)
# Create the PLP plot with custom labeller
PLP_plot <- ggplot(combined_variants_PLP, aes(x = GENE, y = Category, fill = PLP)) +
geom_tile() +
scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"),
values = scales::rescale(c(0, 0.5, 1))) +
facet_wrap(~ Source, scales = "free_x", ncol = 1) +
labs(title = "PLP Heatmap",
x = "Gene",
y = "Category",
fill = "Count") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
strip.background = element_blank(),
strip.placement = "outside") +
scale_y_discrete(labels = function(x) custom_labeller_plp("Category", x))
# Print the plot
print(PLP_plot)
Now bring in the module data
modules <- read_excel("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/cardio_gene_sets.xlsx")
annotate the gene_data with the modules
module_data <- left_join(gene_data_discovery, modules, by = c("GENE" = "Gene"))
# View the first few rows of the updated data
head(module_data)
## ID GENE HIGH PLP Set Category Source
## 1 CGM0000001 A2ML1 0 0 Discovery 1 Cardiomyopathy
## 2 CGM0000001 ABAT 0 0 Discovery 1 Epilepsy
## 3 CGM0000001 ADAM22 0 0 Discovery 1 Epilepsy
## 4 CGM0000001 AKAP9 0 0 Discovery 1 Cardiomyopathy
## 5 CGM0000001 ALDH5A1 0 0 Discovery 1 Epilepsy
## 6 CGM0000001 ALMS1 0 0 Discovery 1 Cardiomyopathy
## Module
## 1 Rasopathy
## 2 <NA>
## 3 <NA>
## 4 Membrane_polarity
## 5 <NA>
## 6 fate_specification
NOW PLOT THE PLPs by their expert-defined categories. The size is the relative number of variants per gene in the category. the Color is the absolute number of variants
#Filter NAs ans aggregate the data
module_data <- module_data %>%
filter(!is.na(Module))
module_data <- module_data %>%
mutate(Category_Group = ifelse(Category == 1, "Category 1", "Categories 2-6"))
# Count up the variants
module_summary <- module_data %>%
group_by(Module, Category_Group) %>%
summarise(Total_PLP_variant = sum(PLP, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'Module'. You can override using the
## `.groups` argument.
# First, calculate the number of genes per module
genes_per_module <- modules %>%
group_by(Module) %>%
summarise(Num_Genes = n()) %>%
ungroup()
# Merge this information with your module_summary data frame
module_summary <- module_summary %>%
left_join(genes_per_module, by = c("Module" = "Module"))
# Calculate the size relative to the number of genes per module
module_summary <- module_summary %>%
mutate(Relative_Size = Total_PLP_variant / Num_Genes)
# Filter the data to only include "Categories 2-6"
module_summary_filtered <- module_summary %>%
filter(Category_Group == "Categories 2-6")
module_order <- module_summary_filtered %>%
group_by(Module) %>%
summarise(Sort_Metric = mean(Total_PLP_variant, na.rm = TRUE)) %>%
arrange(desc(Sort_Metric)) %>%
.$Module
module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)
# Now plot with the reordered Module
modules_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category_Group, size = Total_PLP_variant, color = Relative_Size)) +
geom_point(shape = 15, alpha = 1) +
scale_size(range = c(3, 20), name = "Number of PLPs") +
scale_color_gradient(low = "Grey", high = "#05618F", name = "# of PLPs relative to Module size") +
theme_minimal() +
labs(title = "Total PLP by Module (Cases)",
x = "Module",
y = "") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(modules_plot)
Build the interaction plot data
#Count the High effect variants
aggregated_data <- module_data %>%
group_by(ID, Module, Category) %>%
summarise(High_count = sum(HIGH),
.groups = 'drop')
#split off controls
data_cat_1 <- aggregated_data %>%
filter(Category %in% c(1))
#identify IDs with any 'High_count' greater than 0
ids_with_both_1 <- data_cat_1 %>%
group_by(ID) %>%
summarise(High_count_any = any(High_count > 0)) %>%
filter(High_count_any) %>%
dplyr::select(ID)
# Filter to include only rows that have IDs with High_count > 0
modules_with_both_1 <- data_cat_1 %>%
semi_join(ids_with_both_1, by = "ID")
#find all possible pairs of modules where 'High_count' is greater than 0
# and generate a new df with each row representing a unique pair of modules for each ID.
module_pairs_for_ids_1 <- modules_with_both_1 %>%
group_by(ID) %>%
do({
data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
}) %>%
dplyr::rename(Module1 = X1, Module2 = X2) %>%
left_join(modules_with_both_1, by = c("ID", "Module1" = "Module")) %>%
left_join(modules_with_both_1, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
filter(High_count_2 > 0 | High_count_1 > 0)
# Count occurrences of module pairs across IDs
module_pair_counts_1 <- module_pairs_for_ids_1 %>%
group_by(Module1, Module2) %>%
summarise(Count = n_distinct(ID), .groups = 'drop')
#split off cases
data_cat_2_6 <- aggregated_data %>%
filter(Category %in% c(2,3,4,5,6))
#identify IDs with any 'High_count' greater than 0
ids_with_both_2_6 <- data_cat_2_6 %>%
group_by(ID) %>%
summarise(Missense_any = any(High_count > 0)) %>%
filter(Missense_any) %>%
dplyr::select(ID)
# Filter to include only rows that have IDs with High_count > 0
modules_with_both_2_6 <- data_cat_2_6 %>%
semi_join(ids_with_both_2_6, by = "ID")
#find all possible pairs of modules where 'High_count' is greater than 0
# and generate a new df with each row representing a unique pair of modules for each ID.
module_pairs_for_ids_2_6 <- modules_with_both_2_6 %>%
group_by(ID) %>%
do({
data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
}) %>%
dplyr::rename(Module1 = X1, Module2 = X2) %>%
left_join(modules_with_both_2_6, by = c("ID", "Module1" = "Module")) %>%
left_join(modules_with_both_2_6, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
filter(High_count_2 > 0 | High_count_1 > 0)
# Count occurrences of module pairs across IDs
module_pair_counts_2_6 <- module_pairs_for_ids_2_6 %>%
group_by(Module1, Module2) %>%
summarise(Count = n_distinct(ID), .groups = 'drop')
# Ensure all combinations are present with at least a zero count for each category
comparison_df <- merge(module_pair_counts_1, module_pair_counts_2_6, by = c("Module1", "Module2"), all = TRUE)
comparison_df[is.na(comparison_df)] <- 0 # Replace NA with 0
# Rename columns
names(comparison_df)[names(comparison_df) == "Count.x"] <- "Count_1"
names(comparison_df)[names(comparison_df) == "Count.y"] <- "Count_2_6"
# Add number of people per group for totals not in counts
comparison_df$Not_Count_1 <- sum(cohort$arrest_ontology == "1") - comparison_df$Count_1
comparison_df$Not_Count_2_6 <- sum(cohort$arrest_ontology %in% c(2,3,4,5,6)) - comparison_df$Count_2_6
# Function to perform Fisher's Exact Test for a row
perform_fisher_test <- function(count_1, not_count_1, count_2_6, not_count_2_6) {
contingency_table <- matrix(c(count_1, not_count_1, count_2_6, not_count_2_6),
nrow = 2,
dimnames = list(c("In Count", "Not in Count"), c("Group_1", "Group_2_6")))
fisher_result <- fisher.test(contingency_table)
return(fisher_result$p.value)
}
# Apply the Fisher function to each row in the dataframe
comparison_df$p_value <- mapply(perform_fisher_test,
comparison_df$Count_1,
comparison_df$Not_Count_1,
comparison_df$Count_2_6,
comparison_df$Not_Count_2_6)
comparison_df$adjusted_p_value <- p.adjust(comparison_df$p_value, method = "bonferroni", n = length(comparison_df$p_value))
# Filter significant results based on adjusted p-values
significant_pairs <- comparison_df %>% filter(adjusted_p_value < 0.05)
# Print significant module pairs
print(significant_pairs)
## Module1 Module2 Count_1 Count_2_6 Not_Count_1
## 1 CICR sarcomere 68 104 528
## 2 DGC fate_specification 5 28 591
## 3 DGC ICD 23 51 573
## 4 DGC Membrane_polarity 14 41 582
## 5 DGC Proteostasis 4 24 592
## 6 DGC sarcomere 27 68 569
## 7 fate_specification mitochondria 15 37 581
## 8 fate_specification nuclear_envelope 2 17 594
## 9 fate_specification sarcomere 28 73 568
## 10 ICD fate_specification 24 57 572
## 11 ICD Membrane_polarity 33 68 563
## 12 ICD Proteostasis 23 55 573
## 13 ICD sarcomere 45 95 551
## 14 lysosomal_storage sarcomere 25 61 571
## 15 Membrane_polarity fate_specification 15 46 581
## 16 Membrane_polarity lysosomal_storage 12 35 584
## 17 Membrane_polarity mitochondria 24 50 572
## 18 Membrane_polarity Proteostasis 14 43 582
## 19 Membrane_polarity sarcomere 37 83 559
## 20 mitochondria sarcomere 37 77 559
## 21 Proteostasis fate_specification 5 31 591
## 22 Proteostasis sarcomere 26 72 570
## 23 Rasopathy sarcomere 92 127 504
## Not_Count_2_6 p_value adjusted_p_value
## 1 419 9.184664e-05 8.358044e-03
## 2 495 7.980239e-06 7.262018e-04
## 3 472 9.256654e-05 8.423555e-03
## 4 482 2.248122e-05 2.045791e-03
## 5 499 2.376165e-05 2.162310e-03
## 6 455 4.739262e-07 4.312729e-05
## 7 486 3.201544e-04 2.913405e-02
## 8 506 2.162122e-04 1.967531e-02
## 9 450 6.060785e-08 5.515314e-06
## 10 466 1.366726e-05 1.243721e-03
## 11 455 1.500246e-05 1.365223e-03
## 12 468 1.660428e-05 1.510989e-03
## 13 428 1.046357e-07 9.521851e-06
## 14 462 2.895091e-06 2.634532e-04
## 15 477 4.241213e-06 3.859504e-04
## 16 488 1.330254e-04 1.210531e-02
## 17 473 2.536897e-04 2.308576e-02
## 18 480 8.210955e-06 7.471969e-04
## 19 440 1.988805e-07 1.809813e-05
## 20 446 2.651718e-06 2.413064e-04
## 21 492 1.281642e-06 1.166294e-04
## 22 451 3.284151e-08 2.988577e-06
## 23 396 2.121104e-04 1.930205e-02
Make the plot for module makeup
#merge the high effect count data
High_data <- rbind(data_cat_1, data_cat_2_6)
#assign the case/control categories
High_data <- High_data %>%
mutate(Category_group = case_when(
Category == 1 ~ "1",
TRUE ~ "2-6"
))
#assign in the module gene count data
df_gene_counts <- data.frame(
Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis",
"Rasopathy", "SNS_PNS", "Z_disc", "cytokine",
"fate_specification", "lysosomal_storage", "mitochondria",
"nuclear_envelope", "sarcomere"),
Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)
# scale the high effect date per module based on the genes in the module
High_data_scaled <- High_data %>%
left_join(df_gene_counts, by = "Module")
High_data_scaled <- High_data_scaled %>%
mutate(High_count_per_gene = High_count / Num_Genes)
#compute the means
averages_sem_scaled <- High_data_scaled %>%
dplyr::group_by(Module, Category_group) %>%
dplyr::summarize(
Mean = mean(High_count_per_gene, na.rm = TRUE),
SD = sd(High_count_per_gene, na.rm = TRUE),
N = n(),
SEM = SD / sqrt(N),
.groups = 'drop'
)
#run the t-tests to compare the modules per case or control (Category_group)
test_results <- High_data_scaled %>%
group_by(Module) %>%
do({
ttest <- t.test(High_count_per_gene ~ Category_group, data = .)
tidy(ttest)
}) %>%
ungroup()
# show t-test data
print(test_results)
## # A tibble: 14 × 11
## Module estimate estimate1 estimate2 statistic p.value parameter conf.low
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CICR -6.86e-3 0.0117 0.0185 -1.83 6.77e-2 712. -0.0142
## 2 DGC -3.59e-3 0.000798 0.00439 -2.49 1.31e-2 637. -0.00642
## 3 ICD -7.94e-3 0.00559 0.0135 -2.34 1.96e-2 702. -0.0146
## 4 Membrane_p… -6.76e-3 0.00101 0.00777 -3.95 8.96e-5 483. -0.0101
## 5 Proteostas… -4.79e-3 0.000532 0.00533 -3.01 2.75e-3 509. -0.00792
## 6 Rasopathy -3.10e-3 0.00800 0.0111 -2.06 3.93e-2 897. -0.00604
## 7 SNS_PNS -2.33e-3 0.00463 0.00696 -1.04 3.00e-1 621. -0.00673
## 8 Z_disc 1.02e-2 0.0580 0.0478 1.39 1.65e-1 783. -0.00419
## 9 cytokine 2.88e-2 0.115 0.0866 3.09 2.06e-3 984. 0.0105
## 10 fate_speci… -6.87e-3 0.000508 0.00738 -3.41 7.18e-4 475. -0.0108
## 11 lysosomal_… -2.17e-4 0.00214 0.00236 -0.118 9.06e-1 871. -0.00382
## 12 mitochondr… -1.84e-3 0.000968 0.00281 -2.37 1.83e-2 596. -0.00337
## 13 nuclear_en… -3.13e-3 0 0.00313 -2.01 4.53e-2 318 -0.00620
## 14 sarcomere -1.99e-2 0.00314 0.0230 -4.59 5.60e-6 475. -0.0284
## # ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
#calculate averages
averages_sem_scaled <- High_data_scaled %>%
dplyr::group_by(Module, Category_group) %>%
dplyr::summarize(
Mean = mean(High_count_per_gene),
SD = sd(High_count_per_gene),
N = n(),
SEM = SD / sqrt(N),
.groups = 'drop'
)
# label function for the fill legend
custom_fill_labels <- c("1" = "Control", "2-6" = "Case")
# Plot
High_modules_plot_scaled <- ggplot(averages_sem_scaled, aes(x = Module, y = Mean, fill = Category_group, group = Category_group)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.8), color = "black") +
geom_errorbar(aes(ymin = Mean - SEM, ymax = Mean + SEM), width = .2, position = position_dodge(.8)) +
scale_fill_manual(values = c("1" = "#FF6464", "2-6" = "#05618F"), labels = custom_fill_labels) +
labs(x = "Module", y = "Average High Count Per Gene") +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Show plot
print(High_modules_plot_scaled)
Plot the interaction network
#compute the relative counts based on cohort size
comparison_df <- comparison_df %>%
mutate(Relative_Counts = (Count_2_6 / (sum(cohort$arrest_ontology %in% c(2,3,4,5,6))))/(Count_1 / (sum(cohort$arrest_ontology == "1"))))
#assign in the module gene count data
df_gene_counts <- data.frame(
Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis",
"Rasopathy", "SNS_PNS", "Z_disc", "cytokine",
"fate_specification", "lysosomal_storage", "mitochondria",
"nuclear_envelope", "sarcomere"),
Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)
significant_edges_df <- comparison_df %>%
filter(adjusted_p_value < 0.05)
# Create an igraph object with edge attributes using filtered dataframe
network_graph <- graph_from_data_frame(d = significant_edges_df[, c("Module1", "Module2")], directed = FALSE)
# Matching Num_Genes from df_gene_counts to vertices in the network_graph
V(network_graph)$Num_Genes <- df_gene_counts$Num_Genes[match(V(network_graph)$name, df_gene_counts$Module)]
# Add edge attributes for adjusted_p_value and Relative_Counts from the filtered data frame
E(network_graph)$adjusted_p_value <- significant_edges_df$adjusted_p_value
E(network_graph)$Relative_Counts <- significant_edges_df$Relative_Counts
# Plot the graph with ggraph
HIGH_network <- ggraph(network_graph, layout = 'fr') +
geom_edge_link(aes(color = -log10(adjusted_p_value), edge_width = Relative_Counts), alpha = 0.8) +
geom_node_point(aes(size = Num_Genes), color = "Black") +
geom_node_text(aes(label = name), repel = TRUE, point.padding = unit(0.01, "lines")) +
scale_size_continuous(range = c(3, 10)) +
theme_graph()
print(HIGH_network)
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Make the box plot for the module abundance
#Filter NAs ans aggregate the data
module_data <- module_data %>%
filter(!is.na(Module))
module_data <- module_data %>%
mutate(Category_Group = ifelse(Category == 1, "Category 1", "Categories 2-6"))
#count up the variants
module_summary <- module_data %>%
group_by(Module, Category_Group) %>%
summarise(Total_HIGH_variant = sum(HIGH, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'Module'. You can override using the
## `.groups` argument.
# First, calculate the number of genes per module
genes_per_module <- modules %>%
group_by(Module) %>%
summarise(Num_Genes = n()) %>%
ungroup()
# Merge this information with your module_summary data frame
module_summary <- module_summary %>%
left_join(genes_per_module, by = c("Module" = "Module"))
# Calculate the size relative to the number of genes per module
module_summary <- module_summary %>%
mutate(Relative_Size = Total_HIGH_variant / Num_Genes)
# Filter the data to only include "Categories 2-6"
module_summary_filtered <- module_summary %>%
filter(Category_Group == "Categories 2-6")
#order them by mean count
module_order <- module_summary_filtered %>%
group_by(Module) %>%
summarise(Sort_Metric = mean(Total_HIGH_variant, na.rm = TRUE)) %>%
arrange(desc(Sort_Metric)) %>%
.$Module
module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)
# Now plot with the reordered Module
modules_HIGH_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category_Group, size = Total_HIGH_variant, color = Relative_Size)) +
geom_point(shape = 15, alpha = 1) +
scale_size(range = c(3, 20), name = "Number of HIGHs") +
scale_color_gradient(low = "Grey", high = "#05618F", name = "# of HIGHs relative to Module size") +
theme_minimal() +
labs(title = "Total HIGH by Module (Cases)",
x = "Module",
y = "") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(modules_HIGH_plot)
Pull in uniprot data
# pull down data for MYH7 (uniprot ID P12883)
MYH7_data <- drawProteins::get_features("P12883")
## [1] "Download has worked"
MYH7_data <- drawProteins::feature_to_dataframe(MYH7_data)
# pull down data for MYBPC3 (uniprot ID Q14896)
MYBPC3_data <- drawProteins::get_features("Q14896")
## [1] "Download has worked"
MYBPC3_data <- drawProteins::feature_to_dataframe(MYBPC3_data)
# pull down data for SCN5A (uniprot ID Q14524)
SCN5A_data <- drawProteins::get_features("Q14524")
## [1] "Download has worked"
SCN5A_data <- drawProteins::feature_to_dataframe(SCN5A_data)
Plot PLP variants
MYH7_plot <- draw_canvas(MYH7_data)
MYH7_plot <- draw_chains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_domains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_regions(MYH7_plot, MYH7_data)
#MYH7_plot <- draw_phospho(MYH7_plot, MYH7_data)
variants <- data.frame(
begin = c(1712,1057,908,904,869,741,723,576,369,355),
y = rep(1, 10)
)
# Now, add these points to your plot
MYH7_plot <- MYH7_plot +
geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)
# Adjust themes and aesthetics as needed
MYH7_plot <- MYH7_plot + theme_bw(base_size = 14) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.border = element_blank())
MYH7_plot
##################################################################################################
MYBPC3_plot <- draw_canvas(MYBPC3_data)
MYBPC3_plot <- draw_chains(MYBPC3_plot, MYBPC3_data)
MYBPC3_plot <- draw_domains(MYBPC3_plot, MYBPC3_data)
MYBPC3_plot <- draw_regions(MYBPC3_plot, MYBPC3_data)
#MYBPC3_plot <- draw_phospho(MYBPC3_plot, MYBPC3_data)
#######Note, 7 variants are not shown becasue they are splice variants
variants <- data.frame(
begin = c(1098, 1096, 1064, 791, 542, 502, 495, 258, 219),
y = rep(1, 9)
)
# Now, add these points to your plot
MYBPC3_plot <- MYBPC3_plot +
geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)
# Adjust themes and aesthetics as needed
MYBPC3_plot <- MYBPC3_plot + theme_bw(base_size = 14) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.border = element_blank())
MYBPC3_plot
##################################################################################################
SCN5A_plot <- draw_canvas(SCN5A_data)
SCN5A_plot <- draw_chains(SCN5A_plot, SCN5A_data)
SCN5A_plot <- draw_domains(SCN5A_plot, SCN5A_data)
SCN5A_plot <- draw_regions(SCN5A_plot, SCN5A_data)
#SCN5A_plot <- draw_phospho(SCN5A_plot, SCN5A_data)
#######Note, 2 variants are not shown becasue they are splice variants
variants <- data.frame(
begin = c(1784,1595,1319,1316,845,828,814),
y = rep(1, 7)
)
# Now, add these points to your plot
SCN5A_plot <- SCN5A_plot +
geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)
# Adjust themes as needed
SCN5A_plot <- SCN5A_plot + theme_bw(base_size = 14) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.border = element_blank())
SCN5A_plot
Nonoding******************************************************************************************************************
Pull in the file that is just the ATAC data intervals overlaid on top of the Hi-C intervals which map to the genes of interest
# Read the PC-Hi-C ATAC overlay BED file
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/SFRN_cohort_data/noncoding/noncoding_rare/EP_ATAC_intersect_final_output_nodups.bed", col_names = FALSE, col_types = cols(
X1 = col_character(), # Chromosome
X2 = col_double(), # Start Position
X3 = col_double() # End Position
))
Plot noncoding interval size histogram
# Calculate Interval Sizes
bed_data$interval_size <- bed_data$X3 - bed_data$X2
# Calculate the median of interval sizes
median_interval_size <- median(bed_data$interval_size, na.rm = TRUE)
# plot
interval_size_histogram <- ggplot(bed_data, aes(x = interval_size)) +
geom_histogram(binwidth = 100, fill = "lightblue", color = "black") +
geom_vline(aes(xintercept = median_interval_size), color = "black", linetype = "dashed", size = 1) +
xlab("Interval Size") +
ylab("Frequency") +
ggtitle("Histogram of Interval Sizes") +
theme_cowplot(12)
interval_size_histogram
Report the central tendancy
mean(bed_data$interval_size)
## [1] 497.4085
median(bed_data$interval_size)
## [1] 396
Now pull in the ATAC data that maps to genes of interest, including the promoter fragment the ATAC data mapped to
# Read the BED file
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/EP_ATAC_intersect_final_output_annotated_unique.bed", col_names = c("Chromosome", "Start", "End", "Chromosome2", "GeneStart", "GeneEnd", "GeneNames"), col_types = cols(
Chromosome = col_character(),
Start = col_double(),
End = col_double(),
Chromosome2 = col_character(),
GeneStart = col_double(),
GeneEnd = col_double(),
GeneNames = col_character()
))
# Split the gene names in case there are multiple genes separated by semicolon
bed_data <- bed_data %>%
mutate(GeneNames = strsplit(GeneNames, ";")) %>%
unnest(GeneNames)
# Read the gene list
gene_list <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/Lorenzo_panel_genes_simple.txt")
# Filter bed_data to keep only rows with genes in the gene list
bed_data <- bed_data %>%
filter(GeneNames %in% gene_list)
# Count the number of regions mapped to each gene
gene_count <- bed_data %>%
count(GeneNames)
print(gene_count)
## # A tibble: 347 × 2
## GeneNames n
## <chr> <int>
## 1 A2ML1 15
## 2 ABAT 45
## 3 ABCC9 35
## 4 ACADVL 3
## 5 ACTC1 9
## 6 ACTL6B 13
## 7 ACTN2 21
## 8 ADAM22 3
## 9 ADSL 35
## 10 AGL 59
## # ℹ 337 more rows
Plot histogram that is number of regions that map to the genes of interest
# Calculate the median
median_regions <- median(gene_count$n, na.rm = TRUE)
# plot
regions_per_gene <- ggplot(gene_count, aes(x = n)) +
geom_histogram(binwidth = 1, fill = "#B40F20", color = "#B40F20") +
geom_vline(aes(xintercept = median_regions), color = "black", linetype = "dashed", size = 1) +
xlab("Number of Regions") +
ylab("Frequency") +
theme_cowplot(12) +
ggtitle("Histogram of the Distribution of Number of Regions per Gene")
regions_per_gene
Report the number of genes per region
mean(gene_count$n)
## [1] 31.40922
median(gene_count$n)
## [1] 24
Report the region sizes
bed_data$region_size <- bed_data$End - bed_data$Start
# Sum region sizes for each gene
total_region_size_per_gene <- bed_data %>%
group_by(GeneNames) %>%
summarise(TotalRegionSize = sum(region_size))
mean(total_region_size_per_gene$TotalRegionSize)
## [1] 16239.12
median(total_region_size_per_gene$TotalRegionSize)
## [1] 12010
Plot total sequence space per gene
# Create histogram of Total Region Sizes
ggplot(total_region_size_per_gene, aes(x = TotalRegionSize)) +
geom_histogram(binwidth = 1000, fill = "blue", color = "black") +
xlab("Total Region Size") +
ylab("Frequency") +
ggtitle("Histogram of Total Region Sizes per Gene")
Pull information about the TSS for each gene
gene_list <- unique(bed_data$GeneNames)
# Select hg19 from Ensembl
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "https://grch37.ensembl.org")
# Get TSS positions along with strand information
tss_info <- getBM(attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
filters = 'hgnc_symbol',
values = gene_list,
mart = ensembl)
find TSS distance
# Make function to select the most upstream TSS based on strand
select_upstream_tss <- function(df) {
if (all(df$strand == 1)) {
return(data.frame(transcription_start_site = min(df$transcription_start_site))) # Forward strand
} else {
return(data.frame(transcription_start_site = max(df$transcription_start_site))) # Reverse strand
}
}
# Apply the function to each group of genes
tss_upstream <- tss_info %>%
group_by(hgnc_symbol) %>%
do(select_upstream_tss(.))
# Merge bed_data with tss_upstream on gene names
merged_data <- merge(bed_data, tss_upstream, by.x = "GeneNames", by.y = "hgnc_symbol")
# Calculate the midpoint (peak center) of each interval
merged_data$Center <- (merged_data$Start + merged_data$End) / 2
# Calculate the distance
merged_data$DistanceToTSS <- abs(merged_data$Center - merged_data$transcription_start_site)
Distance to TSS
mean(merged_data$DistanceToTSS)
## [1] 264213.1
median(merged_data$DistanceToTSS)
## [1] 176294
Plot distance to TSS histogram
# Create a histogram
median_distance <- median(merged_data$DistanceToTSS, na.rm = TRUE)
dist_tss <- ggplot(merged_data, aes(x = DistanceToTSS)) +
geom_histogram(binwidth = 0.1, fill = "#FFB3BA", color = "black") +
geom_vline(aes(xintercept = median_distance), color = "black", linetype = "dashed", size = 1) +
xlab("Distance to TSS (bp)") +
ylab("Frequency") +
theme_cowplot(12) +
ggtitle("Histogram of Distances to Transcription Start Sites (TSS)") +
scale_x_log10(breaks = 10^(0:6) * 10000, labels = scales::comma)
dist_tss
Pull all TSSs
# Retrieve TSS information for all genes along with HGNC symbols
all_tss_info <- getBM(
attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
mart = ensembl
)
# Filter out rows where hgnc_symbol is blank
filtered_all_tss_info <- all_tss_info %>%
filter(hgnc_symbol != "")
See if the peak is closer to another TSS than the contacted one!
# Function to select the most upstream TSS along with chromosome information based on strand, just like before, but for all genes
select_upstream_tss <- function(df) {
if (nrow(df) == 0) {
return(data.frame(chromosome_name = NA, transcription_start_site = NA)) # Return NA if there are no rows
}
if (all(df$strand == 1)) {
# Forward strand
tss <- min(df$transcription_start_site, na.rm = TRUE)
} else {
# Reverse strand
tss <- max(df$transcription_start_site, na.rm = TRUE)
}
chromosome <- df$chromosome_name[which.min(abs(df$transcription_start_site - tss))]
return(data.frame(chromosome_name = chromosome, transcription_start_site = tss))
}
# Apply the function to each group of genes
all_tss_upstream <- filtered_all_tss_info %>%
group_by(hgnc_symbol) %>%
do(select_upstream_tss(.)) %>%
ungroup()
# Prepend 'chr' to chromosome names
all_tss_upstream$chromosome_name <- paste0("chr", all_tss_upstream$chromosome_name)
Find the closest transcription start site (TSS) to each peak center and calculate the distance.
# Create GRanges object for all_tss_upstream
tss_ranges <- GRanges(
seqnames = all_tss_upstream$chromosome_name,
ranges = IRanges(start = all_tss_upstream$transcription_start_site, end = all_tss_upstream$transcription_start_site)
)
# Create GRanges object for merged_data
merged_data_ranges <- GRanges(
seqnames = merged_data$Chromosome,
ranges = IRanges(start = merged_data$Center, end = merged_data$Center)
)
# Find the nearest TSS for each midpoint (peak center)
nearest_hits <- nearest(merged_data_ranges, tss_ranges)
# Extract the closest TSS information
closest_tss_info <- all_tss_upstream[nearest_hits, ]
# Add closest TSS information to merged_data
merged_data$closest_gene <- closest_tss_info$hgnc_symbol
merged_data$closest_tss <- closest_tss_info$transcription_start_site
merged_data$closest_tss_chromosome <- closest_tss_info$chromosome_name
# Calculate distance to closest TSS
merged_data$distance_to_closest_tss <- abs(merged_data$Center - merged_data$closest_tss)
Check if the closest TSS gene matches the originally contacted gene
# Compare GeneNames with closest_gene and append
merged_data$gene_match <- merged_data$GeneNames == merged_data$closest_gene
Compute and print the percentage of matches and mismatches between the contacted gene and the closest TSS gene.
# Calculate the fraction of matches
fraction_match <- sum(merged_data$gene_match, na.rm = TRUE) / nrow(merged_data)
fraction_mismatch <- 1-(sum(merged_data$gene_match, na.rm = TRUE) / nrow(merged_data))
# Print the fraction
print(100*fraction_match)
## [1] 13.26727
print(100*fraction_mismatch)
## [1] 86.73273
Lets see which genes have regions mapped to them that are closer to other TSSs
# Aggregate data by GeneNames
gene_summary <- merged_data %>%
dplyr::group_by(GeneNames) %>%
dplyr::summarize(
any_true = any(gene_match, na.rm = TRUE),
any_false = any(!gene_match, na.rm = TRUE),
all_true = all(gene_match, na.rm = TRUE),
all_false = all(!gene_match, na.rm = TRUE)
) %>%
ungroup()
# Adjusted calculations to include 8 missing genes
adjusted_denominator <- nrow(gene_summary) + 16
percent_any_true <- sum(gene_summary$any_true) / adjusted_denominator * 100
number_any_true <- sum(gene_summary$any_true)
percent_any_false <- sum(gene_summary$any_false) / adjusted_denominator * 100
number_any_false <- sum(gene_summary$any_false)
percent_all_true <- sum(gene_summary$all_true) / adjusted_denominator * 100
number_all_true <- sum(gene_summary$all_true)
percent_all_false <- sum(gene_summary$all_false) / adjusted_denominator * 100
number_all_false <- sum(gene_summary$all_false)
percent_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false) / adjusted_denominator * 100
number_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false)
# Print the results
cat("Total genes in panel", 363 ,"\n")
## Total genes in panel 363
cat("Genes with no ATAC peaks:", 100*(16/363),"%, ",16,"\n")
## Genes with no ATAC peaks: 4.407713 %, 16
cat("Genes with at least one TRUE:", percent_any_true,"%, ", number_any_true,"\n")
## Genes with at least one TRUE: 42.69972 %, 155
cat("Genes with at least one FALSE:", percent_any_false,"%, ", number_any_false,"\n")
## Genes with at least one FALSE: 93.66391 %, 340
cat("Genes with only TRUE:", percent_all_true,"%, ", number_all_true,"\n")
## Genes with only TRUE: 1.928375 %, 7
cat("Genes with only FALSE:", percent_all_false,"%, ", number_all_false,"\n")
## Genes with only FALSE: 52.89256 %, 192
cat("Genes with both TRUE and FALSE:", percent_both_true_false,"%, ", number_both_true_false,"\n")
## Genes with both TRUE and FALSE: 40.77135 %, 148
input the count data
noncoding_stats <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/noncoding_frequency_by_individual_for_manuscript_fixed.txt", header = TRUE, sep = "\t")
Categorize the data
categorized_noncoding_stats <- noncoding_stats %>%
mutate(Category_Group = case_when(
Category == 1 ~ "Control",
Category %in% 2:6 ~ "Case"
))
Aggregate and clean
#clean it up to remove inadvertent NAs or infs
categorized_noncoding_stats <- categorized_noncoding_stats %>%
dplyr::select(-ID, -Category) %>%
na.omit() %>%
dplyr::filter_all(all_vars(!is.infinite(.)))
# Aggregate the Data to compute mean and standard deviation for each numeric column
collapsed_noncoding_stats <- categorized_noncoding_stats %>%
group_by(Category_Group) %>%
summarise(across(where(is.numeric), list(mean = ~mean(., na.rm = TRUE),
std = ~sd(., na.rm = TRUE))))
# Remove rows with NA values
collapsed_noncoding_stats <- na.omit(collapsed_noncoding_stats)
Perform permutation test
# List of variables to analyze
noncoding_variables_to_analyze <- c("variant_count", "Epilepsy_gnomAD.0.001","Epilepsy_ultrarare","Cardiomyopathy_gnomAD.0.001","Cardiomyopathy_ultrarare")
# Define the permutation function
permutation_test <- function(data, var, group_var, num_permutations = 10000) {
# Calculate the observed difference in means between the two groups
observed_stat <- abs(mean(data[[var]][data[[group_var]] == "Case"]) -
mean(data[[var]][data[[group_var]] == "Control"]))
# Create an empty vector to store the permutation statistics
perm_stats <- numeric(num_permutations)
# Perform the permutations
for (i in 1:num_permutations) {
# Randomly shuffle the group labels
permuted_group <- sample(data[[group_var]])
# Calculate the difference in means for the permuted data
permuted_stat <- abs(mean(data[[var]][permuted_group == "Case"]) -
mean(data[[var]][permuted_group == "Control"]))
# Store the permuted statistic
perm_stats[i] <- permuted_stat
}
# Calculate the p-value (proportion of permuted stats >= observed stat)
p_value <- mean(perm_stats >= observed_stat)
return(list(observed_stat = observed_stat, perm_stats = perm_stats, p_value = p_value))
}
# Initialize a dataframe to store p-values for each variable
p_value_results <- data.frame(Variable = character(), Observed_Stat = numeric(), p_value = numeric(), stringsAsFactors = FALSE)
# Loop through each variable and perform the permutation test
for (var in noncoding_variables_to_analyze) {
# Run the permutation test
test_result <- permutation_test(categorized_noncoding_stats, var, "Category_Group")
# Extract the permuted statistics, observed statistic, and p-value
perm_stats <- test_result$perm_stats
observed_stat <- test_result$observed_stat
p_value <- test_result$p_value
# Store the p-value and observed statistic in the dataframe
p_value_results <- rbind(p_value_results, data.frame(Variable = var,
Observed_Stat = observed_stat,
p_value = p_value))
# Create a data frame for plotting the permutation distribution
perm_df <- data.frame(perm_stats = perm_stats)
# Plot the permutation distribution with the observed statistic marked and p-value
p <- ggplot(perm_df, aes(x = perm_stats)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
geom_vline(aes(xintercept = observed_stat), color = "red", linetype = "dashed", size = 1) +
labs(title = paste("Permutation Test for", var),
x = "Permuted Statistic",
y = "Frequency",
subtitle = paste("Observed Statistic =", round(observed_stat, 4),
"| P-value =", format(p_value, digits = 4))) + # Add p-value to subtitle
theme_minimal()
# Print the plot
print(p)
}
# Print the table of p-values
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
## Variable Observed_Stat p_value
## 1 variant_count 133.718673 0.000
## 2 Epilepsy_gnomAD.0.001 1.972116 0.013
## 3 Epilepsy_ultrarare 9.608424 0.000
## 4 Cardiomyopathy_gnomAD.0.001 5.523130 0.000
## 5 Cardiomyopathy_ultrarare 7.756665 0.000
Define a function to compute the ratio of each column’s mean relative to the Control group and calculate confidence intervals
# Function to plot each column as a ratio to the means from Group 1
plot_column_ratio <- function(data, col_name) {
# Calculate the mean and standard deviation for Group 1
group1_mean <- mean(data[data$Category_Group == "Control", col_name], na.rm = TRUE)
group1_sd <- sd(data[data$Category_Group == "Control", col_name], na.rm = TRUE)
# Calculate the ratio for each group relative to Group 1
data_summary <- data %>%
group_by(Category_Group) %>%
summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
sd_value = sd(.data[[col_name]], na.rm = TRUE),
n = n()) %>%
mutate(ratio = mean_value / group1_mean,
sem_value = sd_value / sqrt(n), # SEM calculation
sem_ratio = sem_value / group1_mean, # SEM ratio
ci_lower = ratio - (1.96 * sem_ratio), # Lower bound of the CI
ci_upper = ratio + (1.96 * sem_ratio)) # Upper bound of the CI
return(list(summary_data = data_summary))
}
Iterate over selected columns, compute summary statistics, and store results in a dataframe.
# List of all columns to plot
columns_to_plot <- setdiff(names(categorized_noncoding_stats), c("ID", "Category", "Category_Group"))
# Initialize an empty dataframe to store summary data
combined_noncoding_stats_summary_df <- data.frame(Category_Group = character(), col_name = character(), mean =
numeric(), std = numeric(), stringsAsFactors = FALSE)
# Loop through each column and plot relative to Category_Group1
for (col in columns_to_plot) {
plot_data <- plot_column_ratio(categorized_noncoding_stats, col)
# Append summary data to combined_noncoding_stats_summary_df
combined_noncoding_stats_summary_df <- bind_rows(combined_noncoding_stats_summary_df, mutate(plot_data$summary_data, col_name = col))
}
Filter and factorize selected noncoding variants, then generate a plot showing their mean ratios compared to the Control group
# Subset the data for the specified variables
selected_noncoding_data <- combined_noncoding_stats_summary_df %>%
filter(col_name %in% c(
"Cardiomyopathy_ultrarare",
"Epilepsy_ultrarare"),
Category_Group != "Control")
# Define the specific order for noncoding variants
levels_order <- c(
"Epilepsy_ultrarare",
"Cardiomyopathy_ultrarare"
)
# Factorize the 'col_name' column with the specified order
selected_noncoding_data$col_name <- factor(selected_noncoding_data$col_name, levels = levels_order)
# Plot
noncoding_stats_plot <- ggplot(selected_noncoding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
geom_point(position = position_dodge(width = 1), size = 3) +
geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = position_dodge(width = 0.5), width = 0.5) +
geom_vline(xintercept = 1, linetype = "dashed") +
scale_color_manual(values = "#FF6464") +
labs(title = "Ratio Compared to Group 1 +/-SEM",
y = "Variant",
x = "Ratio to Group 1 Mean",
color = "Category Group") +
theme_minimal() +
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8, hjust = 1),
axis.title.x = element_text(size = 8),
axis.title.y = element_text(size = 8),
legend.title = element_text(size = 8),
legend.text = element_text(size = 8),
plot.title = element_text(size = 8, hjust = 0.5),
plot.subtitle = element_text(size = 8),
plot.caption = element_text(size = 8),
plot.margin = margin(15, 15, 15, 15)) +
scale_x_continuous(limits = c(0.75, 2))
print(noncoding_stats_plot)
Input the data (change to your path)
# Pull in the gene data
gene_data_noncoding <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_noncoding_variants_by_gene_for_manuscript_fixed.csv")
# Read the cohort data
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort_for_manuscript_fixed.txt", sep = "\t", header = TRUE)
# add the arrest category to each
gene_data_noncoding$Category <- cohort$arrest_ontology[match(gene_data_noncoding$ID, cohort$CGM_id)]
#filter for discovery only
gene_data_noncoding_discovery <- gene_data_noncoding %>%
filter(Set == "Discovery")
Summarize the data by GENE, counting the number of variants for each gene Filter for Category > 1 and then group and summarize
variants_per_gene_Cat_2_6 <- gene_data_noncoding_discovery %>%
filter(Category > 1) %>%
group_by(GENE) %>%
summarise(
noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
print(variants_per_gene_Cat_2_6)
## # A tibble: 424 × 2
## GENE noncoding_ultrarare
## <chr> <int>
## 1 A2ML1 95
## 2 AAMP 10
## 3 ABAT 64
## 4 ABCC9 0
## 5 AC011551.3 0
## 6 ACADVL 27
## 7 ACTC1 31
## 8 ACTL6B 172
## 9 ACTN2 28
## 10 ADAM22 0
## # ℹ 414 more rows
Filter for Category == 1 and then group and summarize
variants_per_gene_Cat_1 <- gene_data_noncoding_discovery %>%
filter(Category == 1) %>%
group_by(GENE) %>%
summarise(
noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
print(variants_per_gene_Cat_1)
## # A tibble: 424 × 2
## GENE noncoding_ultrarare
## <chr> <int>
## 1 A2ML1 37
## 2 AAMP 1
## 3 ABAT 50
## 4 ABCC9 0
## 5 AC011551.3 0
## 6 ACADVL 9
## 7 ACTC1 7
## 8 ACTL6B 130
## 9 ACTN2 16
## 10 ADAM22 0
## # ℹ 414 more rows
Append panel source to the gene_data
# Ensure that the 'GENE' column in 'gene_data_noncoding_discovery' and 'Gene' in 'all_genes' are character type
gene_data_noncoding_discovery$GENE <- as.character(gene_data_noncoding_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)
# Find the index of each gene in 'all_genes'
# Then, use this index to assign the corresponding 'Source' to a new column in 'gene_data'
gene_data_noncoding_discovery$Source <- all_genes$Source[match(gene_data_noncoding_discovery$GENE, all_genes$Gene)]
# Now, 'gene_data' will have a new column 'Source' with the source of each gene
# based on the lookup from 'all_genes'
# View the first few rows to verify the new 'Source' has been added
head(gene_data_noncoding_discovery)
## ID GENE noncoding_ultrarare Set X X.1 Category
## 1 CGM0000969 RANGRF 0 Discovery NA NA 2
## 2 CGM0000969 AGL 0 Discovery NA NA 2
## 3 CGM0000969 RNF13 0 Discovery NA NA 2
## 4 CGM0000969 PRKAG2 0 Discovery NA NA 2
## 5 CGM0000969 JPH2 1 Discovery NA NA 2
## 6 CGM0000969 RP11-156P1.2 0 Discovery NA NA 2
## Source
## 1 Cardiomyopathy
## 2 Cardiomyopathy
## 3 Epilepsy
## 4 Cardiomyopathy
## 5 Cardiomyopathy
## 6 <NA>
Add the source to the category 1 variants list
variants_per_gene_Cat_1$Source <- all_genes$Source[match(variants_per_gene_Cat_1$GENE, all_genes$Gene)]
head(variants_per_gene_Cat_1)
## # A tibble: 6 × 3
## GENE noncoding_ultrarare Source
## <chr> <int> <chr>
## 1 A2ML1 37 Cardiomyopathy
## 2 AAMP 1 <NA>
## 3 ABAT 50 Epilepsy
## 4 ABCC9 0 Cardiomyopathy
## 5 AC011551.3 0 <NA>
## 6 ACADVL 9 Cardiomyopathy
Add the source to the category 2-6 variants list
variants_per_gene_Cat_2_6$Source <- all_genes$Source[match(variants_per_gene_Cat_2_6$GENE, all_genes$Gene)]
head(variants_per_gene_Cat_2_6)
## # A tibble: 6 × 3
## GENE noncoding_ultrarare Source
## <chr> <int> <chr>
## 1 A2ML1 95 Cardiomyopathy
## 2 AAMP 10 <NA>
## 3 ABAT 64 Epilepsy
## 4 ABCC9 0 Cardiomyopathy
## 5 AC011551.3 0 <NA>
## 6 ACADVL 27 Cardiomyopathy
combine the variants together now
# Add a new column to indicate the category
variants_per_gene_Cat_1 <- variants_per_gene_Cat_1 %>%
mutate(Category = "1")
# Add a new column to indicate the category range
variants_per_gene_Cat_2_6 <- variants_per_gene_Cat_2_6 %>%
mutate(Category = "2-6")
# Combine the two datasets
combined_variants <- bind_rows(variants_per_gene_Cat_1, variants_per_gene_Cat_2_6)
# Print the combined dataset
print(combined_variants)
## # A tibble: 848 × 4
## GENE noncoding_ultrarare Source Category
## <chr> <int> <chr> <chr>
## 1 A2ML1 37 Cardiomyopathy 1
## 2 AAMP 1 <NA> 1
## 3 ABAT 50 Epilepsy 1
## 4 ABCC9 0 Cardiomyopathy 1
## 5 AC011551.3 0 <NA> 1
## 6 ACADVL 9 Cardiomyopathy 1
## 7 ACTC1 7 Cardiomyopathy 1
## 8 ACTL6B 130 Epilepsy 1
## 9 ACTN2 16 Cardiomyopathy 1
## 10 ADAM22 0 Epilepsy 1
## # ℹ 838 more rows
Plot the number of noncoding variant types for the genes
# Filter dataset where noncoding_ultrarare > 0
noncoding_ultrarare_data <- combined_variants %>%
filter(noncoding_ultrarare > 0) %>%
filter(!is.na(Source))
# Label function for noncoding_ultrarare plot
custom_labeller_noncoding <- function(variable, value) {
if (variable == "Category") {
return(ifelse(value == "1", "Control", "Case"))
}
return(value)
}
# Create the noncoding_ultrarare plot
noncoding_ultrarare_plot <- ggplot(noncoding_ultrarare_data, aes(x = GENE, y = Category, fill = noncoding_ultrarare)) +
geom_tile() +
scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"),
values = scales::rescale(c(0, 0.5, 1))) +
facet_wrap(~ Source, scales = "free_x", ncol = 1) +
labs(title = "Noncoding Ultrarare Heatmap",
x = "Gene",
y = "Category",
fill = "Count") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
strip.background = element_blank(),
strip.placement = "outside") +
scale_y_discrete(labels = function(x) custom_labeller_noncoding("Category", x))
# Print the plot
print(noncoding_ultrarare_plot)
Extract pLI constraint scores from gnomAD, compute enrichment stats for noncoding ultrarare variants, and plot their significance relative to pLI
# gnomAD_data pLI constraint data
gnomad_data <- fread("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/gnomad.v2.1.1.lof_metrics.by_gene.txt")
# Function to fetch pLI
get_pLI <- function(gene_symbol, gnomad_data) {
result <- gnomad_data[gene == gene_symbol, .(gene, pLI)]
if (nrow(result) == 0) {
stop("Gene not found in gnomAD data.")
}
return(result)
}
# Initialize an empty data frame to store the results
final_merged_data <- data.frame(GENE = character(), p_value = numeric(), pLI = numeric(), Source = character(), ratio = numeric(), stringsAsFactors = FALSE)
# Process each panel "Source" group separately
for (source in unique(noncoding_ultrarare_data$Source)) {
# Filter data by panel "Source"
source_data <- noncoding_ultrarare_data %>% filter(Source == source)
# Extract unique genes for this Source
unique_genes <- unique(source_data$GENE)
# Create a data frame to store pLI values
pli_values <- data.frame(GENE = character(), pLI = numeric(), stringsAsFactors = FALSE)
# Fetch pLI values for each gene
for (gene in unique_genes) {
try({
pli_value <- get_pLI(gene, gnomad_data)
pli_values <- rbind(pli_values, data.frame(GENE = gene, pLI = pli_value$pLI, stringsAsFactors = FALSE))
}, silent = TRUE)
}
# Compute p-values and ratios for each gene
p_values <- source_data %>%
group_by(GENE) %>%
summarise(
p_value = {
control_count <- sum(noncoding_ultrarare[Category == 1], na.rm = TRUE)
case_count <- sum(noncoding_ultrarare[Category == "2-6"], na.rm = TRUE)
total_count <- sum(noncoding_ultrarare, na.rm = TRUE)
expected_control <- total_count * (537 / (537 + 506))
expected_case <- total_count * (506 / (537 + 506))
chisq_stat <- ((control_count - expected_control)^2 / expected_control) +
((case_count - expected_case)^2 / expected_case)
pchisq(chisq_stat, df = 1, lower.tail = FALSE)
},
ratio = {
control_count <- sum(noncoding_ultrarare[Category == 1], na.rm = TRUE)
case_count <- sum(noncoding_ultrarare[Category == "2-6"], na.rm = TRUE)
if (is.na(control_count) | is.na(case_count) | control_count == 0) {
NA
} else {
(case_count/506) / (control_count/537)
}
}
)
# Merge with pLI values
merged_data <- merge(p_values, pli_values, by = "GENE")
merged_data$Source <- source
# Append to the final data frame
final_merged_data <- rbind(final_merged_data, merged_data)
}
# Calculate the Bonferroni correction cutoff
num_tests <- nrow(final_merged_data)
bonferroni_cutoff <- 0.05 / num_tests
# Filter for Bonferroni corrected p-values
bonferroni_filtered_data <- final_merged_data %>%
filter(p_value < bonferroni_cutoff)
# Filter for the top 12 lowest p-values within each source group
top10_labels <- bonferroni_filtered_data %>%
group_by(Source) %>%
slice_min(order_by = p_value, n = 12) %>%
ungroup()
#Filter for only Cardio
cardio_data <- bonferroni_filtered_data %>%
filter(Source == "Cardiomyopathy")
# Plot
reg_variant_plot <- ggplot(cardio_data, aes(y = -log10(p_value), x = pLI, color = ratio)) +
geom_point() +
geom_text(data = top10_labels, aes(label = GENE), hjust = 0.5, vjust = -0.5) +
scale_color_gradient(low = "#3C8C78", high = "#dcb43c", na.value = "grey") +
labs(title = paste("P-Values of Noncoding Ultrarare Variants vs. pLI by Source (Bonferroni, p <", format(bonferroni_cutoff, digits = 2), ")"),
x = "pLI",
y = "-log10(p-value)",
color = "Ratio\n(Categories 2-6 / Category 1)") +
theme_cowplot(12)
# Print the plot
print(reg_variant_plot)
This is added from HOMER results
motif_data <- data.frame(
TranscriptionFactor = c("IRF4", "NFY", "NKX2.5", "PRDM1", "SMAD2", "ESRRA", "NFKB","ASCL1"),
PValue = c(1.00E-23, 1.00E-19, 1.00E-18, 1.00E-17, 1.00E-17, 1.00E-17, 1.00E-15, 1.00E-13)
)
# Create the plot
motif_plot <- ggplot(motif_data, aes(y = TranscriptionFactor, x = -log10(PValue))) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Transcription Factor P-values",
y = "Transcription Factor",
x = "-log10(P-value)") +
theme_cowplot(12)
motif_plot
Combined****************************************************************************************************************** Read the cohort data
# Read the cohort data
total_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/combined_data_for_manuscript_fixed_NEW.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)
combined_data <- total_data %>% filter(Set =="Discovery")
replication_data <- total_data %>% filter(Set =="Replication")
Categorize the Data based on original categories and arrest status
#Clump by category
categorized_combined_data <- combined_data %>%
mutate(Category = case_when(
arrest_group == 1 ~ "Control",
arrest_group %in% 2:6 ~ "zCase"
)) %>%
filter(!is.na(Category))
#Convert Category to a factor
categorized_combined_data$Category <- as.factor(categorized_combined_data$Category)
############################ ############## ############## replication
#Clump replication by category z used for formatting reasons I am too lazy to fix
categorized_replication_data <- replication_data %>%
mutate(Category = case_when(
arrest_group == 1 ~ "Control",
arrest_group %in% 2:6 ~ "zCase"
)) %>%
filter(!is.na(Category))
# Convert Category to a factor
categorized_replication_data$Category <- as.factor(categorized_replication_data$Category)
Collapse the rare vars
categorized_combined_data <- categorized_combined_data %>%
mutate(Epilepsy_rare = (Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001),
Epilepsy_ultrarare = (Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton))
categorized_combined_data <- categorized_combined_data %>%
mutate(Epilepsy_null = (Epilepsy_start_lost + Epilepsy_stop_gained))
categorized_combined_data <- categorized_combined_data %>%
mutate(Cardiomyopathy_rare = (Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001),
Cardiomyopathy_ultrarare = (Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton))
categorized_combined_data <- categorized_combined_data %>%
mutate(Cardiomyopathy_null = (Cardiomyopathy_start_lost + Cardiomyopathy_stop_gained))
categorized_combined_data <- categorized_combined_data %>%
mutate(Epilepsy_noncoding_rare = (Epilepsy_noncoding_gnomad_0.001 + Epilepsy_noncoding_cohort_0.01 + Epilepsy_noncoding_cohort_singleton))
categorized_combined_data <- categorized_combined_data %>%
mutate(Cardiomyopathy_noncoding_rare = (Cardiomyopathy_noncoding_gnomad_0.001 + Cardiomyopathy_noncoding_cohort_0.01 + Cardiomyopathy_noncoding_cohort_singleton))
############################ ############## ############## replication
categorized_replication_data <- categorized_replication_data %>%
mutate(Epilepsy_rare = (Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001),
Epilepsy_ultrarare = (Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton))
categorized_replication_data <- categorized_replication_data %>%
mutate(Epilepsy_null = (Epilepsy_start_lost + Epilepsy_stop_gained))
categorized_replication_data <- categorized_replication_data %>%
mutate(Cardiomyopathy_rare = (Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001),
Cardiomyopathy_ultrarare = (Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton))
categorized_replication_data <- categorized_replication_data %>%
mutate(Cardiomyopathy_null = (Cardiomyopathy_start_lost + Cardiomyopathy_stop_gained))
categorized_replication_data <- categorized_replication_data %>%
mutate(Epilepsy_noncoding_rare = (Epilepsy_noncoding_gnomad_0.001 + Epilepsy_noncoding_cohort_0.01 + Epilepsy_noncoding_cohort_singleton))
categorized_replication_data <- categorized_replication_data %>%
mutate(Cardiomyopathy_noncoding_rare = (Cardiomyopathy_noncoding_gnomad_0.001 + Cardiomyopathy_noncoding_cohort_0.01 + Cardiomyopathy_noncoding_cohort_singleton))
Perform multivariate Logistic Regression on everything
model <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, family = binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model
##
## Call: glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval +
## Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF +
## Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI +
## Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare +
## Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy +
## Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare +
## Cardiomyopathy_null + Epilepsy_null, family = binomial(),
## data = categorized_combined_data)
##
## Coefficients:
## (Intercept) Normalized_Heart_rate
## -1.13276 -0.06633
## Normalized_PR_interval Normalized_QTc
## 0.10545 -0.06282
## Normalized_BP Normalized_QRS
## 0.29888 0.41302
## Normalized_LVEF Normalized_LVESV
## -0.17686 0.16828
## Normalized_SVI Normalized_Afib
## 0.05697 -0.07965
## Normalized_LVEDVI Normalized_SV
## -0.09490 -0.09174
## Epilepsy_rare Epilepsy_ultrarare
## -0.06083 -0.01102
## Cardiomyopathy_rare Cardiomyopathy_ultrarare
## -0.03503 -0.01155
## PLP_Epilepsy PLP_Cardiomyopathy
## -0.19700 1.18960
## Cardiomyopathy_noncoding_rare Epilepsy_noncoding_rare
## -0.01056 0.03214
## Cardiomyopathy_null Epilepsy_null
## 0.96647 0.44278
##
## Degrees of Freedom: 992 Total (i.e. Null); 971 Residual
## Null Deviance: 1370
## Residual Deviance: 1132 AIC: 1176
Compute the scores
# tell it how to make the scores
scores <- predict(model, type = "link")
scores_replication <- predict(model, newdata = categorized_replication_data, type = "link")
# Add scores to the dataframes
categorized_combined_data$scores <- scores
categorized_replication_data$scores_replication <- scores_replication
Perform multivariate Logistic Regressions based only on the input paramaters and subsets
model_cardiomyopathy_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null, data = categorized_combined_data, family = binomial())
model_epilepsy_rare <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null, data = categorized_combined_data, family = binomial())
model_noncoding_rare <- glm(Category ~ Epilepsy_noncoding_rare + Cardiomyopathy_noncoding_rare, data = categorized_combined_data, family = binomial())
model_common <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())
model_coding_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_null + Epilepsy_null, data = categorized_combined_data, family = binomial())
model_epilepsy_and_common <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())
model_cardiac_and_common <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())
model_common_and_coding <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, family = binomial())
Compute the rest of the scores and determine rank-based quintiles for all scores
categorized_combined_data$cardiac_variant_score <- predict(model_cardiomyopathy_rare, type = "link")
categorized_combined_data$epilepsy_variant_score <- predict(model_epilepsy_rare, type = "link")
categorized_combined_data$noncoding_variant_score <- predict(model_noncoding_rare, type = "link")
categorized_combined_data$common_variant_score <- predict(model_common, type = "link")
categorized_combined_data$common_and_coding_score <- predict(model_common_and_coding, type = "link")
categorized_combined_data$coding_rare_score <- predict(model_coding_rare, type = "link")
categorized_combined_data$epilepsy_and_common_score <- predict(model_epilepsy_and_common, type = "link")
categorized_combined_data$cardiac_and_common_score <- predict(model_cardiac_and_common, type = "link")
categorized_combined_data <- categorized_combined_data %>%
mutate(
rank = rank(scores, ties.method = "first"),
score_quintiles = ceiling(rank / (n() / 5)),
rank_cardiac = rank(cardiac_variant_score, ties.method = "first"),
cardiac_score_quintiles = ceiling(rank_cardiac / (n() / 5)),
rank_epilepsy = rank(epilepsy_variant_score, ties.method = "first"),
epilepsy_score_quintiles = ceiling(rank_epilepsy / (n() / 5)),
rank_noncoding = rank(noncoding_variant_score, ties.method = "first"),
noncoding_score_quintiles = ceiling(rank_noncoding / (n() / 5)),
rank_common = rank(common_variant_score, ties.method = "first"),
common_score_quintiles = ceiling(rank_common / (n() / 5)),
rank_common_and_coding = rank(common_and_coding_score, ties.method = "first"),
common_and_coding_score_quintiles = ceiling(rank_common_and_coding / (n() / 5)),
rank_model_coding_rare = rank(coding_rare_score, ties.method = "first"),
coding_rare_score_quintiles = ceiling(rank_model_coding_rare / (n() / 5)),
rank_epilepsy_and_common = rank(epilepsy_and_common_score, ties.method = "first"),
epilepsy_and_common_score_quintiles = ceiling(rank_epilepsy_and_common / (n() / 5)),
rank_cardiac_and_common = rank(cardiac_and_common_score, ties.method = "first"),
cardiac_and_common_score_quintiles = ceiling(rank_cardiac_and_common / (n() / 5))
)
categorized_combined_data <- categorized_combined_data %>%
mutate(
probability = plogis(scores),
cardiac_probability = plogis(cardiac_variant_score),
epilepsy_probability = plogis(epilepsy_variant_score),
noncoding_probability = plogis(noncoding_variant_score),
common_probability = plogis(common_variant_score),
common_and_coding_probability = plogis(common_and_coding_score),
coding_rare_probability = plogis(coding_rare_score),
epilepsy_and_common_probability = plogis(epilepsy_and_common_score),
cardiac_and_common_probability = plogis(cardiac_and_common_score),
)
# Function to calculate odds ratio and CI
calculate_odds_ratios <- function(data, category_column, quintile_column) {
# Compute counts and initial odds for each quintile
odds_data <- data %>%
dplyr::group_by({{ quintile_column }}) %>%
dplyr::summarise(
count_category_1 = sum({{ category_column }} == "Control", na.rm = TRUE),
count_category_2_6 = sum({{ category_column }} == "zCase", na.rm = TRUE),
.groups = 'drop'
) %>%
dplyr::mutate(
odds = count_category_2_6 / count_category_1
)
# Retrieve the odds of the first quintile as the reference
first_quintile_odds <- odds_data$odds[1]
# Calculate the odds ratio relative to the first quintile
odds_data <- odds_data %>%
dplyr::mutate(
odds_ratio = odds / first_quintile_odds,
se_log_odds_ratio = sqrt(1 / count_category_1 + 1 / count_category_2_6),
lower_ci = exp(log(odds_ratio) - 1.96 * se_log_odds_ratio),
upper_ci = exp(log(odds_ratio) + 1.96 * se_log_odds_ratio)
)
return(odds_data)
}
# Apply function to each odds category
combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
cardiac_odds_summary = calculate_odds_ratios(categorized_combined_data,Category, cardiac_score_quintiles)
epilepsy_summary = calculate_odds_ratios(categorized_combined_data,Category, epilepsy_score_quintiles)
common_summary = calculate_odds_ratios(categorized_combined_data,Category, common_score_quintiles)
noncoding_summary = calculate_odds_ratios(categorized_combined_data,Category, noncoding_score_quintiles)
common_and_coding_summary = calculate_odds_ratios(categorized_combined_data,Category, common_and_coding_score_quintiles)
coding_rare_summary = calculate_odds_ratios(categorized_combined_data,Category, coding_rare_score_quintiles)
commmon_and_epilepsy_summary = calculate_odds_ratios(categorized_combined_data,Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary = calculate_odds_ratios(categorized_combined_data,Category, cardiac_and_common_score_quintiles)
################################################################################################################ replication
categorized_replication_data$cardiac_variant_score_replication <- predict(model_cardiomyopathy_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$epilepsy_variant_score_replication <- predict(model_epilepsy_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$noncoding_variant_score_replication <- predict(model_noncoding_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$common_variant_score_replication <- predict(model_common, newdata = categorized_replication_data, type = "link")
categorized_replication_data$common_and_coding_score_replication <- predict(model_common_and_coding, newdata = categorized_replication_data, type = "link")
categorized_replication_data$coding_rare_score_replication <- predict(model_coding_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$epilepsy_and_common_score_replication <- predict(model_epilepsy_and_common, newdata = categorized_replication_data, type = "link")
categorized_replication_data$cardiac_and_common_score_replication <- predict(model_cardiac_and_common, newdata = categorized_replication_data, type = "link")
categorized_replication_data <- categorized_replication_data %>%
mutate(
rank = rank(scores_replication, ties.method = "first"),
score_quintiles = ceiling(rank / (n() / 5)),
rank_cardiac = rank(cardiac_variant_score_replication, ties.method = "first"),
cardiac_score_quintiles = ceiling(rank_cardiac / (n() / 5)),
rank_epilepsy = rank(epilepsy_variant_score_replication, ties.method = "first"),
epilepsy_score_quintiles = ceiling(rank_epilepsy / (n() / 5)),
rank_noncoding = rank(noncoding_variant_score_replication, ties.method = "first"),
noncoding_score_quintiles = ceiling(rank_noncoding / (n() / 5)),
rank_common = rank(common_variant_score_replication, ties.method = "first"),
common_score_quintiles = ceiling(rank_common / (n() / 5)),
rank_common_and_coding = rank(common_and_coding_score_replication, ties.method = "first"),
common_and_coding_score_quintiles = ceiling(rank_common_and_coding / (n() / 5)),
rank_model_coding_rare = rank(coding_rare_score_replication, ties.method = "first"),
coding_rare_score_quintiles = ceiling(rank_model_coding_rare / (n() / 5)),
rank_epilepsy_and_common = rank(epilepsy_and_common_score_replication, ties.method = "first"),
epilepsy_and_common_score_quintiles = ceiling(rank_epilepsy_and_common / (n() / 5)),
rank_cardiac_and_common = rank(cardiac_and_common_score_replication, ties.method = "first"),
cardiac_and_common_score_quintiles = ceiling(rank_cardiac_and_common / (n() / 5))
)
categorized_replication_data <- categorized_replication_data %>%
mutate(
probability = plogis(scores_replication),
cardiac_probability = plogis(cardiac_variant_score_replication),
epilepsy_probability = plogis(epilepsy_variant_score_replication),
noncoding_probability = plogis(noncoding_variant_score_replication),
common_probability = plogis(common_variant_score_replication),
common_and_coding_probability = plogis(common_and_coding_score_replication),
coding_rare_probability = plogis(coding_rare_score_replication),
epilepsy_and_common_probability = plogis(epilepsy_and_common_score_replication),
cardiac_and_common_probability = plogis(cardiac_and_common_score_replication),
)
# Apply function to each odds category
combined_odds_summary_replication = calculate_odds_ratios(categorized_replication_data, Category, score_quintiles)
cardiac_odds_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, cardiac_score_quintiles)
epilepsy_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, epilepsy_score_quintiles)
common_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, common_score_quintiles)
noncoding_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, noncoding_score_quintiles)
common_and_coding_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, common_and_coding_score_quintiles)
coding_rare_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, coding_rare_score_quintiles)
commmon_and_epilepsy_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, cardiac_and_common_score_quintiles)
plot
plot(log2(combined_odds_summary$odds_ratio),
ylim = c(
log2(min(c(combined_odds_summary$lower_ci, cardiac_odds_summary$lower_ci, epilepsy_summary$lower_ci,
common_summary$lower_ci, noncoding_summary$lower_ci, common_and_coding_summary$lower_ci,
coding_rare_summary$lower_ci))),
log2(max(c(combined_odds_summary$upper_ci, cardiac_odds_summary$upper_ci, epilepsy_summary$upper_ci,
common_summary$upper_ci, noncoding_summary$upper_ci, common_and_coding_summary$upper_ci,
coding_rare_summary$upper_ci)))
),
pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
# Add error bars for 95% CI - Combined
xaxt = "n", col = "#3C8C78")
lines(1:length(combined_odds_summary$odds_ratio), log2(combined_odds_summary$odds_ratio), col = "#3C8C78")
arrows(x0 = 1:length(combined_odds_summary$odds_ratio),
y0 = log2(combined_odds_summary$lower_ci),
y1 = log2(combined_odds_summary$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#3C8C78")
# Cardiac
lines(1:length(cardiac_odds_summary$odds_ratio), log2(cardiac_odds_summary$odds_ratio), col = "#2E86C1")
points(log2(cardiac_odds_summary$odds_ratio), pch = 19, col = "#2E86C1")
arrows(x0 = 1:length(cardiac_odds_summary$odds_ratio),
y0 = log2(cardiac_odds_summary$lower_ci),
y1 = log2(cardiac_odds_summary$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#2E86C1")
# Epilepsy
lines(1:length(epilepsy_summary$odds_ratio), log2(epilepsy_summary$odds_ratio), col = "#A93226")
points(log2(epilepsy_summary$odds_ratio), pch = 19, col = "#A93226")
arrows(x0 = 1:length(epilepsy_summary$odds_ratio),
y0 = log2(epilepsy_summary$lower_ci),
y1 = log2(epilepsy_summary$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#A93226")
# Common
lines(1:length(common_summary$odds_ratio), log2(common_summary$odds_ratio), col = "#229954")
points(log2(common_summary$odds_ratio), pch = 19, col = "#229954")
arrows(x0 = 1:length(common_summary$odds_ratio),
y0 = log2(common_summary$lower_ci),
y1 = log2(common_summary$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#229954")
# Noncoding
lines(1:length(noncoding_summary$odds_ratio), log2(noncoding_summary$odds_ratio), col = "#D68910")
points(log2(noncoding_summary$odds_ratio), pch = 19, col = "#D68910")
arrows(x0 = 1:length(noncoding_summary$odds_ratio),
y0 = log2(noncoding_summary$lower_ci),
y1 = log2(noncoding_summary$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#D68910")
# common and coding summary
lines(1:length(common_and_coding_summary$odds_ratio), log2(common_and_coding_summary$odds_ratio), col = "Black")
points(log2(common_and_coding_summary$odds_ratio), pch = 19, col = "Black")
arrows(x0 = 1:length(common_and_coding_summary$odds_ratio),
y0 = log2(common_and_coding_summary$lower_ci),
y1 = log2(common_and_coding_summary$upper_ci),
code = 3, angle = 90, length = 0.05, col = "Black")
# coding rare summary
lines(1:length(coding_rare_summary$odds_ratio), log2(coding_rare_summary$odds_ratio), col = "Pink")
points(log2(coding_rare_summary$odds_ratio), pch = 19, col = "Pink")
arrows(x0 = 1:length(coding_rare_summary$odds_ratio),
y0 = log2(coding_rare_summary$lower_ci),
y1 = log2(coding_rare_summary$upper_ci),
code = 3, angle = 90, length = 0.05, col = "Pink")
# Add x-axis labels for quintiles
axis(1, at = 1:length(combined_odds_summary$odds_ratio), labels = TRUE)
# legend
legend("topleft", legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding", "common_and_coding", "model_coding_rare"),
col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910", "Black", "Pink"), pch = 19, cex = 0.8)
plot replication
plot(log2(combined_odds_summary_replication$odds_ratio),
ylim = c(-2,10),
pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
# Add error bars for 95% CI - Combined
xaxt = "n", col = "#3C8C78")
lines(1:length(combined_odds_summary_replication$odds_ratio), log2(combined_odds_summary_replication$odds_ratio), col = "#3C8C78")
arrows(x0 = 1:length(combined_odds_summary_replication$odds_ratio),
y0 = log2(combined_odds_summary_replication$lower_ci),
y1 = log2(combined_odds_summary_replication$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#3C8C78")
# Cardiac
lines(1:length(cardiac_odds_summary_replication$odds_ratio), log2(cardiac_odds_summary_replication$odds_ratio), col = "#2E86C1")
points(log2(cardiac_odds_summary_replication$odds_ratio), pch = 19, col = "#2E86C1")
arrows(x0 = 1:length(cardiac_odds_summary_replication$odds_ratio),
y0 = log2(cardiac_odds_summary_replication$lower_ci),
y1 = log2(cardiac_odds_summary_replication$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#2E86C1")
# Epilepsy
lines(1:length(epilepsy_summary_replication$odds_ratio), log2(epilepsy_summary_replication$odds_ratio), col = "#A93226")
points(log2(epilepsy_summary_replication$odds_ratio), pch = 19, col = "#A93226")
arrows(x0 = 1:length(epilepsy_summary_replication$odds_ratio),
y0 = log2(epilepsy_summary_replication$lower_ci),
y1 = log2(epilepsy_summary_replication$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#A93226")
# Common
lines(1:length(common_summary_replication$odds_ratio), log2(common_summary_replication$odds_ratio), col = "#229954")
points(log2(common_summary_replication$odds_ratio), pch = 19, col = "#229954")
arrows(x0 = 1:length(common_summary_replication$odds_ratio),
y0 = log2(common_summary_replication$lower_ci),
y1 = log2(common_summary_replication$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#229954")
# Noncoding
lines(1:length(noncoding_summary_replication$odds_ratio), log2(noncoding_summary_replication$odds_ratio), col = "#D68910")
points(log2(noncoding_summary_replication$odds_ratio), pch = 19, col = "#D68910")
arrows(x0 = 1:length(noncoding_summary_replication$odds_ratio),
y0 = log2(noncoding_summary_replication$lower_ci),
y1 = log2(noncoding_summary_replication$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#D68910")
# common and coding summary
lines(1:length(common_and_coding_summary_replication$odds_ratio), log2(common_and_coding_summary_replication$odds_ratio), col = "Black")
points(log2(common_and_coding_summary_replication$odds_ratio), pch = 19, col = "Black")
arrows(x0 = 1:length(common_and_coding_summary_replication$odds_ratio),
y0 = log2(common_and_coding_summary_replication$lower_ci),
y1 = log2(common_and_coding_summary_replication$upper_ci),
code = 3, angle = 90, length = 0.05, col = "Black")
# coding rare summary
lines(1:length(coding_rare_summary_replication$odds_ratio), log2(coding_rare_summary_replication$odds_ratio), col = "Pink")
points(log2(coding_rare_summary_replication$odds_ratio), pch = 19, col = "Pink")
arrows(x0 = 1:length(coding_rare_summary_replication$odds_ratio),
y0 = log2(coding_rare_summary_replication$lower_ci),
y1 = log2(coding_rare_summary_replication$upper_ci),
code = 3, angle = 90, length = 0.05, col = "Pink")
# Add x-axis labels for quintiles
axis(1, at = 1:length(combined_odds_summary_replication$odds_ratio), labels = TRUE)
# legend
legend("topleft", legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding", "common_and_coding", "model_coding_rare"),
col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910", "Black", "Pink"), pch = 19, cex = 0.8)
Z test for odds significance
compare_odds <- function(odds1, odds2) {
# Calculate the difference in log odds
log_odds1 <- log(odds1$odds_ratio)
log_odds2 <- log(odds2$odds_ratio)
delta_log_odds <- log_odds1 - log_odds2
# Calculate the standard error of the difference
se1 <- odds1$se_log_odds_ratio
se2 <- odds2$se_log_odds_ratio
se_delta_log_odds <- sqrt(se1^2 + se2^2)
# Z-test for each quintile
z_values <- delta_log_odds / se_delta_log_odds
p_values <- 2 * pnorm(abs(z_values), lower.tail = FALSE)
# Return a data frame with the results
return(data.frame(quintile = 1:length(p_values), p_values = p_values))
}
# Apply the function to compare 'coding_rare_summary', 'common_and_coding_summary', and 'combined_odds_summary'
p_values_coding_vs_common_coding <- compare_odds(coding_rare_summary, common_and_coding_summary)
p_values_coding_vs_model <- compare_odds(coding_rare_summary, combined_odds_summary)
p_values_common_coding_vs_model <- compare_odds(common_and_coding_summary, combined_odds_summary)
# Combine the data frames with an identifier for each comparison
p_values_coding_vs_common_coding$comparison <- "Coding vs Common and Coding"
p_values_coding_vs_model$comparison <- "Coding vs Model"
p_values_common_coding_vs_model$comparison <- "Common and Coding vs Model"
# Bind the rows together
combined_p_values <- bind_rows(p_values_coding_vs_common_coding,
p_values_coding_vs_model,
p_values_common_coding_vs_model)
# Convert quintile to a factor
combined_p_values$quintile <- factor(combined_p_values$quintile, levels = 1:5)
combined_p_values
## quintile p_values comparison
## 1 1 1.0000000000 Coding vs Common and Coding
## 2 2 0.0302326527 Coding vs Common and Coding
## 3 3 0.1655321102 Coding vs Common and Coding
## 4 4 0.0016031606 Coding vs Common and Coding
## 5 5 0.0376069426 Coding vs Common and Coding
## 6 1 1.0000000000 Coding vs Model
## 7 2 0.0459511700 Coding vs Model
## 8 3 0.0041855535 Coding vs Model
## 9 4 0.0006605028 Coding vs Model
## 10 5 0.0006065780 Coding vs Model
## 11 1 1.0000000000 Common and Coding vs Model
## 12 2 0.8875234087 Common and Coding vs Model
## 13 3 0.1454995597 Common and Coding vs Model
## 14 4 0.8097807932 Common and Coding vs Model
## 15 5 0.1582202334 Common and Coding vs Model
Calculate the scores per category and plot them
#plot
common_variant_score_plot <- ggplot(categorized_combined_data, aes(x = Category, y = common_variant_score, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2, 2) +
theme_cowplot(12)
cardiac_variant_score_plot <- ggplot(categorized_combined_data, aes(x = Category, y = cardiac_variant_score, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-1, 1) +
theme_cowplot(12)
epilepsy_variant_score_plot <- ggplot(categorized_combined_data, aes(x = Category, y = epilepsy_variant_score, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-1, 1) +
theme_cowplot(12)
noncoding_variant_score_plot <- ggplot(categorized_combined_data, aes(x = Category, y = noncoding_variant_score, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-1, 0.75) +
theme_cowplot(12)
scores_plot <- ggplot(categorized_combined_data, aes(x = Category, y = scores, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-3, 3) +
theme_cowplot(12)
suppressWarnings(print(common_variant_score_plot))
suppressWarnings(print(cardiac_variant_score_plot))
suppressWarnings(print(epilepsy_variant_score_plot))
suppressWarnings(print(noncoding_variant_score_plot))
suppressWarnings(print(scores_plot))
replication
Now plot the Z normalized replication data
mean_common_replication <- mean(categorized_replication_data$common_variant_score_replication)
sd_common_replication <- sd(categorized_replication_data$common_variant_score_replication)
mean_cardiac_replication <- mean(categorized_replication_data$cardiac_variant_score_replication)
sd_cardiac_replication <- sd(categorized_replication_data$cardiac_variant_score_replication)
mean_epilepsy_replication <- mean(categorized_replication_data$epilepsy_variant_score_replication)
sd_epilepsy_replication <- sd(categorized_replication_data$epilepsy_variant_score_replication)
mean_noncoding_replication <- mean(categorized_replication_data$noncoding_variant_score_replication)
sd_noncoding_replication <- sd(categorized_replication_data$noncoding_variant_score_replication)
mean_scores_replication <- mean(categorized_replication_data$scores_replication)
sd_scores_replication <- sd(categorized_replication_data$scores_replication)
common_variant_score_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (common_variant_score_replication - mean_common_replication) / sd_common_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
cardiac_variant_score_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (cardiac_variant_score_replication - mean_cardiac_replication)/ sd_cardiac_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
epilepsy_variant_score_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (epilepsy_variant_score_replication - mean_epilepsy_replication)/ sd_epilepsy_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
noncoding_variant_score_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (noncoding_variant_score_replication - mean_noncoding_replication)/ sd_noncoding_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
scores_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (scores_replication - mean_scores_replication)/ sd_scores_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
suppressWarnings(print(common_variant_score_replication_plot))
suppressWarnings(print(cardiac_variant_score_replication_plot))
suppressWarnings(print(epilepsy_variant_score_replication_plot))
suppressWarnings(print(noncoding_variant_score_replication_plot))
suppressWarnings(print(scores_replication_plot))
Compute the wilcox.tests
wilcox.test_cardiomyopathy <- wilcox.test(cardiac_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_cardiomyopathy
##
## Wilcoxon rank sum test with continuity correction
##
## data: cardiac_variant_score by Category
## W = 76739, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_epilepsy <- wilcox.test(epilepsy_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_epilepsy
##
## Wilcoxon rank sum test with continuity correction
##
## data: epilepsy_variant_score by Category
## W = 86944, p-value = 3.215e-15
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_common <- wilcox.test(common_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_common
##
## Wilcoxon rank sum test with continuity correction
##
## data: common_variant_score by Category
## W = 77863, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_noncoding <- wilcox.test(noncoding_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_noncoding
##
## Wilcoxon rank sum test with continuity correction
##
## data: noncoding_variant_score by Category
## W = 101916, p-value = 5.208e-06
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_combined <- wilcox.test(scores ~ Category, data = categorized_combined_data)
wilcox.test_combined
##
## Wilcoxon rank sum test with continuity correction
##
## data: scores by Category
## W = 57835, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
replication
wilcox.test_cardiomyopathy_replication <- wilcox.test(cardiac_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_cardiomyopathy_replication
##
## Wilcoxon rank sum test with continuity correction
##
## data: cardiac_variant_score_replication by Category
## W = 850, p-value = 3.419e-08
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_epilepsy_replication <- wilcox.test(epilepsy_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_epilepsy_replication
##
## Wilcoxon rank sum test with continuity correction
##
## data: epilepsy_variant_score_replication by Category
## W = 1286, p-value = 0.0006998
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_common_replication <- wilcox.test(common_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_common_replication
##
## Wilcoxon rank sum test with continuity correction
##
## data: common_variant_score_replication by Category
## W = 1667, p-value = 0.1309
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_noncoding_replication <- wilcox.test(noncoding_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_noncoding_replication
##
## Wilcoxon rank sum test with continuity correction
##
## data: noncoding_variant_score_replication by Category
## W = 523, p-value = 1.213e-12
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_combined_replication <- wilcox.test(scores_replication ~ Category, data = categorized_replication_data)
wilcox.test_combined_replication
##
## Wilcoxon rank sum test with continuity correction
##
## data: scores_replication by Category
## W = 498, p-value = 4.973e-13
## alternative hypothesis: true location shift is not equal to 0
Plot the distribution of the categories by score
distribution_score_plot <- ggplot(categorized_combined_data, aes(x = scores, fill = Category)) +
geom_density(aes(y = ..density..), alpha = 0.5, adjust = 1) +
scale_fill_manual(values = group_colors) +
scale_x_continuous(limits = c(-3, 4)) +
labs(title = "Normalized Density Plots of Scores by Category",
x = "Score",
y = "Density") +
theme_cowplot(12)
suppressWarnings(print(distribution_score_plot))
Plot the filled density of the categories by score
density_score_plot <- ggplot(categorized_combined_data, aes(x = scores, fill = Category)) +
geom_density(position = "fill", alpha = 0.5) +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(limits = c(-3, 4)) +
labs(title = "Stacked Density of Scores by Category",
x = "Score",
y = "Fraction") +
theme_cowplot() +
scale_fill_manual(values = group_colors)
# Print the plot
suppressWarnings(print(density_score_plot))
Plot the ROCs
# Convert 'Category' into a binary format: 0 for control (1), 1 for case (2-4)
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)
#Full model
roc_result_full <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
paste("Full model AUC:")
## [1] "Full model AUC:"
auc(roc_result_full)
## Area under the curve: 0.7638
#common variants
roc_result_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
paste("commonvariant model AUC:")
## [1] "commonvariant model AUC:"
auc(roc_result_common)
## Area under the curve: 0.682
#coding variants
roc_result_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$coding_rare_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
paste("coding variant model AUC:")
## [1] "coding variant model AUC:"
auc(roc_result_coding)
## Area under the curve: 0.7008
#cardiac and common variants
roc_result_cardiac_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_and_common_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
paste("coding variant model AUC:")
## [1] "coding variant model AUC:"
auc(roc_result_cardiac_and_common)
## Area under the curve: 0.7364
#common and coding variants
roc_result_common_and_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_and_coding_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
paste("common and coding variant model AUC:")
## [1] "common and coding variant model AUC:"
auc(roc_result_common_and_coding)
## Area under the curve: 0.7449
Plot the replication ROC and Precision-recall
# Convert 'Category' into a binary format: 0 for control (1), 1 for case (2-4)
categorized_replication_data$binary_outcome <- ifelse(categorized_replication_data$Category == "Control", 0, 1)
roc_result_full_replication <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_result_full_replication)
## Area under the curve: 0.874
Figure out the ROC improvement with each variant class and plot it
#compute the ones not plotted
roc_result_cardiac <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_cardiac_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Calculate AUC for each ROC object
auc_epilepsy <- auc(roc_result_epilepsy)
auc_cardiac <- auc(roc_result_cardiac)
auc_common <- auc(roc_result_common)
auc_epilepsy_and_common <- auc(roc_result_epilepsy_and_common)
auc_cardiac_and_common <- auc(roc_result_cardiac_and_common)
auc_common_and_coding <- auc(roc_result_common_and_coding)
auc_coding <- auc(roc_result_coding)
auc_full <- auc(roc_result_full)
# Base AUC for random chance
auc_random_chance <- 0.5
# Create a data frame for plotting
percentage_changes_df <- data.frame(
Model = c("Epilepsy","Cardiac","Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Coding", "Full"),
PercentageChange = c(
(auc_epilepsy - auc_random_chance) / auc_random_chance * 100,
(auc_cardiac - auc_random_chance) / auc_random_chance * 100,
(auc_common - auc_random_chance) / auc_random_chance * 100,
(auc_epilepsy_and_common - auc_random_chance) / auc_random_chance * 100,
(auc_cardiac_and_common - auc_random_chance) / auc_random_chance * 100,
(auc_common_and_coding - auc_random_chance) / auc_random_chance * 100,
(auc_coding - auc_random_chance) / auc_random_chance * 100,
(auc_full - auc_random_chance) / auc_random_chance * 100
)
)
auc_df <- data.frame(
Model = c("Epilepsy","Cardiac","Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Coding", "Full"),
auc = c(
(auc_epilepsy),
(auc_cardiac),
(auc_common),
(auc_epilepsy_and_common),
(auc_cardiac_and_common),
(auc_common_and_coding),
(auc_coding),
(auc_full)
)
)
# Set factor levels
percentage_changes_df$Model <- factor(percentage_changes_df$Model, levels = c(
"Epilepsy","Cardiac","Coding", "Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Full"
))
# Plot
auc_performance_plot <- ggplot(percentage_changes_df, aes(x = Model, y = PercentageChange, fill = Model)) +
geom_bar(stat = "identity", position = position_dodge()) +
theme_minimal() +
labs(title = "Percentage Change Over Random Chance AUC",
y = "Percentage Change (%)",
x = "") +
theme_cowplot(12)
print(auc_performance_plot)
# Calculate p-values using DeLong's test
p_values <- list(
"Epilepsy vs CMAR" = roc.test(roc_result_epilepsy, roc_result_cardiac)$p.value,
"CMAR vs Common" = roc.test(roc_result_common, roc_result_cardiac)$p.value,
"Epilepsy vs Common" = roc.test(roc_result_common, roc_result_epilepsy)$p.value,
"Coding vs CMAR" = roc.test(roc_result_coding, roc_result_cardiac)$p.value,
"Coding vs Epilepsy" = roc.test(roc_result_coding, roc_result_epilepsy)$p.value,
"CMAR and Common vs Common" = roc.test(roc_result_cardiac_and_common, roc_result_common)$p.value,
"Common vs Coding" = roc.test(roc_result_common, roc_result_coding)$p.value,
"CMAR and common vs Common and Coding" = roc.test(roc_result_cardiac_and_common, roc_result_common_and_coding)$p.value,
"Common vs Common and Coding" = roc.test(roc_result_common, roc_result_common_and_coding)$p.value,
"Full vs Common and Coding" = roc.test(roc_result_common_and_coding, roc_result_full)$p.value
)
# Print p-values
print(p_values)
## $`Epilepsy vs CMAR`
## [1] 0.03714183
##
## $`CMAR vs Common`
## [1] 0.8236934
##
## $`Epilepsy vs Common`
## [1] 0.05250968
##
## $`Coding vs CMAR`
## [1] 0.1295995
##
## $`Coding vs Epilepsy`
## [1] 4.40734e-05
##
## $`CMAR and Common vs Common`
## [1] 3.718671e-06
##
## $`Common vs Coding`
## [1] 0.336048
##
## $`CMAR and common vs Common and Coding`
## [1] 0.06421658
##
## $`Common vs Common and Coding`
## [1] 2.86077e-07
##
## $`Full vs Common and Coding`
## [1] 0.004615593
Compute the deviance attributable to each class
# Perform ANOVA on the model
null_model <- glm(Category ~ 1, data = categorized_combined_data, family = binomial())
# Perform ANOVA on the full model
anova_results <- anova(model, test = "Chisq")
# Calculate the null deviance and the full model deviance
null_deviance <- deviance(null_model)
full_deviance <- deviance(model)
# Calculate the total sum of squares (using total deviance)
tss <- null_deviance - full_deviance
# Calculate the proportion of deviance explained by each predictor
anova_results$Proportion_Variance_Explained <- anova_results$`Deviance` / tss
# Display the results
anova_results
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Category
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 992 1370.0
## Normalized_Heart_rate 1 8.094 991 1361.9 0.00444
## Normalized_PR_interval 1 6.193 990 1355.7 0.01282
## Normalized_QTc 1 5.429 989 1350.3 0.01980
## Normalized_BP 1 21.492 988 1328.8 0.00000
## Normalized_QRS 1 32.145 987 1296.6 0.00000
## Normalized_LVEF 1 11.611 986 1285.0 0.00066
## Normalized_LVESV 1 2.819 985 1282.2 0.09316
## Normalized_SVI 1 4.344 984 1277.8 0.03715
## Normalized_Afib 1 0.736 983 1277.1 0.39103
## Normalized_LVEDVI 1 6.693 982 1270.4 0.00968
## Normalized_SV 1 1.424 981 1269.0 0.23282
## Epilepsy_rare 1 7.220 980 1261.8 0.00721
## Epilepsy_ultrarare 1 11.748 979 1250.0 0.00061
## Cardiomyopathy_rare 1 6.754 978 1243.3 0.00935
## Cardiomyopathy_ultrarare 1 3.595 977 1239.7 0.05795
## PLP_Epilepsy 1 0.009 976 1239.7 0.92564
## PLP_Cardiomyopathy 1 46.892 975 1192.8 0.00000
## Cardiomyopathy_noncoding_rare 1 3.390 974 1189.4 0.06561
## Epilepsy_noncoding_rare 1 35.430 973 1154.0 0.00000
## Cardiomyopathy_null 1 16.730 972 1137.2 0.00004
## Epilepsy_null 1 4.795 971 1132.4 0.02855
## Proportion_Variance_Explained
## NULL
## Normalized_Heart_rate 0.034075
## Normalized_PR_interval 0.026072
## Normalized_QTc 0.022857
## Normalized_BP 0.090478
## Normalized_QRS 0.135322
## Normalized_LVEF 0.048878
## Normalized_LVESV 0.011867
## Normalized_SVI 0.018286
## Normalized_Afib 0.003097
## Normalized_LVEDVI 0.028176
## Normalized_SV 0.005993
## Epilepsy_rare 0.030394
## Epilepsy_ultrarare 0.049455
## Cardiomyopathy_rare 0.028434
## Cardiomyopathy_ultrarare 0.015135
## PLP_Epilepsy 0.000037
## PLP_Cardiomyopathy 0.197408
## Cardiomyopathy_noncoding_rare 0.014270
## Epilepsy_noncoding_rare 0.149152
## Cardiomyopathy_null 0.070428
## Epilepsy_null 0.020185
REDO the “improvement” testing in the replication cohort
# Compute ROC curves for all models
roc_result_cardiac <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$cardiac_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$epilepsy_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy_and_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$epilepsy_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_cardiac_and_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$cardiac_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_common_and_coding <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$common_and_coding_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_coding <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$coding_rare_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_full <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Calculate AUC for each ROC object
auc_epilepsy <- auc(roc_result_epilepsy)
auc_cardiac <- auc(roc_result_cardiac)
auc_common <- auc(roc_result_common)
auc_epilepsy_and_common <- auc(roc_result_epilepsy_and_common)
auc_cardiac_and_common <- auc(roc_result_cardiac_and_common)
auc_common_and_coding <- auc(roc_result_common_and_coding)
auc_coding <- auc(roc_result_coding)
auc_full <- auc(roc_result_full)
# Base AUC for random chance
auc_random_chance <- 0.5
# Create a data frame for plotting
percentage_changes_df <- data.frame(
Model = c("Epilepsy", "Cardiac", "Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Coding", "Full"),
PercentageChange = c(
(auc_epilepsy - auc_random_chance) / auc_random_chance * 100,
(auc_cardiac - auc_random_chance) / auc_random_chance * 100,
(auc_common - auc_random_chance) / auc_random_chance * 100,
(auc_epilepsy_and_common - auc_random_chance) / auc_random_chance * 100,
(auc_cardiac_and_common - auc_random_chance) / auc_random_chance * 100,
(auc_common_and_coding - auc_random_chance) / auc_random_chance * 100,
(auc_coding - auc_random_chance) / auc_random_chance * 100,
(auc_full - auc_random_chance) / auc_random_chance * 100
)
)
auc_df <- data.frame(
Model = c("Epilepsy", "Cardiac", "Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Coding", "Full"),
auc = c(
auc_epilepsy,
auc_cardiac,
auc_common,
auc_epilepsy_and_common,
auc_cardiac_and_common,
auc_common_and_coding,
auc_coding,
auc_full
)
)
# Set factor levels
percentage_changes_df$Model <- factor(percentage_changes_df$Model, levels = c(
"Epilepsy", "Cardiac", "Coding", "Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Full"
))
# Plot
auc_performance_plot <- ggplot(percentage_changes_df, aes(x = Model, y = PercentageChange, fill = Model)) +
geom_bar(stat = "identity", position = position_dodge()) +
theme_minimal() +
labs(title = "Percentage Change Over Random Chance AUC",
y = "Percentage Change (%)",
x = "") +
theme_cowplot(12)
print(auc_performance_plot)
# Calculate p-values using DeLong's test
p_values <- list(
"Epilepsy vs CMAR" = roc.test(roc_result_epilepsy, roc_result_cardiac)$p.value,
"CMAR vs Common" = roc.test(roc_result_common, roc_result_cardiac)$p.value,
"Epilepsy vs Common" = roc.test(roc_result_common, roc_result_epilepsy)$p.value,
"Coding vs CMAR" = roc.test(roc_result_coding, roc_result_cardiac)$p.value,
"Coding vs Epilepsy" = roc.test(roc_result_coding, roc_result_epilepsy)$p.value,
"CMAR and Common vs Common" = roc.test(roc_result_cardiac_and_common, roc_result_common)$p.value,
"Common vs Coding" = roc.test(roc_result_common, roc_result_coding)$p.value,
"CMAR and common vs Common and Coding" = roc.test(roc_result_cardiac_and_common, roc_result_common_and_coding)$p.value,
"Common vs Common and Coding" = roc.test(roc_result_common, roc_result_common_and_coding)$p.value,
"Full vs CMAR" = roc.test(roc_result_cardiac, roc_result_full)$p.value
)
# Print p-values
print(p_values)
## $`Epilepsy vs CMAR`
## [1] 0.05440609
##
## $`CMAR vs Common`
## [1] 0.0005775265
##
## $`Epilepsy vs Common`
## [1] 0.1143759
##
## $`Coding vs CMAR`
## [1] 0.7027383
##
## $`Coding vs Epilepsy`
## [1] 0.003616045
##
## $`CMAR and Common vs Common`
## [1] 0.004465682
##
## $`Common vs Coding`
## [1] 0.001003417
##
## $`CMAR and common vs Common and Coding`
## [1] 0.1669517
##
## $`Common vs Common and Coding`
## [1] 0.0009395074
##
## $`Full vs CMAR`
## [1] 0.04508866
Lets see how common and rare cardiac vars interact
# Plot
dot_plot <- ggplot(categorized_combined_data, aes(x = rank_cardiac, y = rank_common, color = Category)) +
geom_point(alpha = 0.7) +
scale_color_manual(values = c("1" = "blue", "2-3" = "green", "4-6" = "red")) +
labs(x = "Cardiac Variant Score Rank", y = "Common Variant Score Rank", color = "Arrest Status") +
theme_cowplot(12) +
theme(strip.text.x = element_text(size = 10))
# Plot
dot_plot <- ggplot(categorized_combined_data, aes(x = rank_cardiac, y = rank_common, color = Category)) +
geom_point(alpha = 1) +
scale_color_manual(values = group_colors) +
labs(x = "Cardiac Variant Score Rank", y = "Common Variant Score Rank", color = "Arrest Status") +
theme_cowplot(12) +
theme(strip.text.x = element_text(size = 10))
dot_plot
median_cardiac <- median(categorized_combined_data$rank_cardiac, na.rm = TRUE)
median_common <- median(categorized_combined_data$rank_common, na.rm = TRUE)
# Filter for the top half
quadrant1 <- categorized_combined_data %>%
filter(rank_cardiac <= median_cardiac & rank_common <= median_common)
quadrant2 <- categorized_combined_data %>%
filter(rank_cardiac < median_cardiac & rank_common > median_common)
quadrant3 <- categorized_combined_data %>%
filter(rank_cardiac > median_cardiac & rank_common < median_common)
quadrant4 <- categorized_combined_data %>%
filter(rank_cardiac >= median_cardiac & rank_common >= median_common)
# Combine into one data frame
combined_quadrants <- bind_rows(
quadrant1 %>% mutate(Quadrant = 'Quadrant 1'),
quadrant2 %>% mutate(Quadrant = 'Quadrant 2'),
quadrant3 %>% mutate(Quadrant = 'Quadrant 3'),
quadrant4 %>% mutate(Quadrant = 'Quadrant 4')
)
# Calculate the count of individuals in each quadrant for each category
# and the total count of individuals in each category
percentage_by_category <- combined_quadrants %>%
group_by(Category, Quadrant) %>%
summarise(Count = n(), .groups = 'drop') %>%
group_by(Category) %>%
mutate(Total = sum(Count), Percentage = Count / Total * 100)
# View the result
percentage_by_category
## # A tibble: 8 × 5
## # Groups: Category [2]
## Category Quadrant Count Total Percentage
## <fct> <chr> <int> <int> <dbl>
## 1 Control Quadrant 1 235 537 43.8
## 2 Control Quadrant 2 107 537 19.9
## 3 Control Quadrant 3 98 537 18.2
## 4 Control Quadrant 4 97 537 18.1
## 5 zCase Quadrant 1 79 456 17.3
## 6 zCase Quadrant 2 75 456 16.4
## 7 zCase Quadrant 3 84 456 18.4
## 8 zCase Quadrant 4 218 456 47.8
# Calculate overall proportions for each quadrant
overall_counts <- combined_quadrants %>%
count(Quadrant) %>%
mutate(OverallProportion = n / sum(n))
# Calculate total counts by category for scaling expected values
category_totals <- combined_quadrants %>%
count(Category) %>%
dplyr::rename(TotalInCategory = n)
# Calculate expected counts
expected_counts <- combined_quadrants %>%
dplyr::select(Category, Quadrant) %>%
distinct() %>%
left_join(overall_counts, by = "Quadrant") %>%
left_join(category_totals, by = "Category") %>%
mutate(Expected = OverallProportion * TotalInCategory)
# Calculate observed counts
observed_counts <- combined_quadrants %>%
count(Category, Quadrant) %>%
dplyr::rename(Observed = n)
# combine
combined_for_plotting <- combined_quadrants %>%
count(Category, Quadrant) %>%
dplyr::rename(Observed = n) %>%
left_join(overall_counts, by = "Quadrant") %>%
left_join(category_totals, by = "Category") %>%
mutate(Expected = OverallProportion * TotalInCategory, Ratio = Observed / Expected)
# Calculate the Observed/Expected ratio
combined_for_plotting <- combined_for_plotting %>%
mutate(Ratio = Observed / Expected)
# Plot the Observed/Expected ratio
quadrant_plot <- ggplot(combined_for_plotting, aes(x = Quadrant, y = Ratio, fill = Category)) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(y = "Observed/Expected Ratio", x = "Quadrant", title = "Observed/Expected Ratios by Category and Quadrant") +
theme_cowplot(12) +
scale_fill_manual(values = group_colors) +
geom_hline(yintercept = 1, linetype = "dashed", color = "Black")
quadrant_plot
# Create the contingency table
contingency_table <- xtabs(Observed ~ Category + Quadrant, data = observed_counts)
# Print the contingency table
print(contingency_table)
## Quadrant
## Category Quadrant 1 Quadrant 2 Quadrant 3 Quadrant 4
## Control 235 107 98 97
## zCase 79 75 84 218
# Chi-squared test
chi_squared_result <- chisq.test(contingency_table)
# Print the results of the chi-squared test
print(chi_squared_result)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 124.91, df = 3, p-value < 2.2e-16
Filter high cardiac-score individuals, visualize their cardiac vs. common variant scores, and assess interactions
# Filter the data for cardiac_score_quintiles > 3
top_2_quintiles <- categorized_combined_data %>%
filter(cardiac_score_quintiles > 3)
common_density_score_plot <- ggplot(top_2_quintiles, aes(x = cardiac_variant_score, y = common_variant_score)) +
geom_point(aes(color = Category), alpha = 0.5) +
geom_smooth(method = "lm", aes(group = Category, color = Category)) +
labs(x = "Cardiac Variant Score", y = "Common Variant Score", title = "Cardiac Variant Score vs. Common Variant Score by Category") +
scale_color_manual(values = group_colors) +
theme_cowplot(12)
# Highlight the specific point in red
common_density_score_plot <- common_density_score_plot +
geom_point(data = subset(top_2_quintiles, ID == "CGM0000563"), aes(x = cardiac_variant_score, y = common_variant_score), color = "red")
# Print
print(common_density_score_plot)
## `geom_smooth()` using formula = 'y ~ x'
#Do the stats
model_common_cardiac <- lm(common_variant_score ~ cardiac_variant_score * Category, data = top_2_quintiles)
summary(model_common_cardiac)
##
## Call:
## lm(formula = common_variant_score ~ cardiac_variant_score * Category,
## data = top_2_quintiles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8514 -0.3866 0.0569 0.4076 1.5320
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.22986 0.05737 -4.006 7.38e-05 ***
## cardiac_variant_score -0.16418 0.09335 -1.759 0.0794 .
## CategoryzCase 0.40285 0.07411 5.436 9.59e-08 ***
## cardiac_variant_score:CategoryzCase 0.16371 0.09635 1.699 0.0901 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6477 on 394 degrees of freedom
## Multiple R-squared: 0.1034, Adjusted R-squared: 0.09659
## F-statistic: 15.15 on 3 and 394 DF, p-value: 2.386e-09
Visualize the relationship between cardiac and epilepsy variant scores
epilepsy_density_score_plot <- ggplot(top_2_quintiles, aes(x = cardiac_variant_score, y = epilepsy_variant_score)) +
geom_point(aes(color = Category), alpha = 0.5) +
geom_smooth(method = "lm", aes(group = Category, color = Category)) +
labs(x = "Cardiac Variant Score", y = "Epilepsy Variant Score", title = "Cardiac Variant Score vs. Epilepsy Variant Score by Category") +
scale_color_manual(values = group_colors) +
theme_cowplot(12)
# Highlight the specific point in red
epilepsy_density_score_plot <- epilepsy_density_score_plot +
geom_point(data = subset(top_2_quintiles, ID == "CGM0000563"), aes(x = cardiac_variant_score, y = epilepsy_variant_score), color = "red")
# Print
print(epilepsy_density_score_plot)
## `geom_smooth()` using formula = 'y ~ x'
#Do the stats
model_epilepsy_cardiac <- lm(epilepsy_variant_score ~ cardiac_variant_score * Category, data = top_2_quintiles)
summary(model_epilepsy_cardiac)
##
## Call:
## lm(formula = epilepsy_variant_score ~ cardiac_variant_score *
## Category, data = top_2_quintiles)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8187 -0.4169 -0.0110 0.3811 9.3169
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1577 0.0861 -1.832 0.0677 .
## cardiac_variant_score -0.1611 0.1401 -1.150 0.2509
## CategoryzCase -0.2170 0.1112 -1.952 0.0517 .
## cardiac_variant_score:CategoryzCase 0.9352 0.1446 6.468 2.95e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.972 on 394 degrees of freedom
## Multiple R-squared: 0.5589, Adjusted R-squared: 0.5555
## F-statistic: 166.4 on 3 and 394 DF, p-value: < 2.2e-16
Do the stats for the interaction data
# do wilcox_test
CMARR_epilepsy_wilcox_test_results <- wilcox.test(epilepsy_variant_score ~ Category,
data = top_2_quintiles %>% filter(Category %in% c("zCase", "Control")))
CMARR_epilepsy_wilcox_test_results
##
## Wilcoxon rank sum test with continuity correction
##
## data: epilepsy_variant_score by Category
## W = 11515, p-value = 4.869e-10
## alternative hypothesis: true location shift is not equal to 0
CMARR_common_wilcox_test_results <- wilcox.test(common_variant_score ~ Category,
data = top_2_quintiles %>% filter(Category %in% c("zCase", "Control")))
CMARR_common_wilcox_test_results
##
## Wilcoxon rank sum test with continuity correction
##
## data: common_variant_score by Category
## W = 11649, p-value = 1.063e-09
## alternative hypothesis: true location shift is not equal to 0
Calculate how well GAPS performs in PLP- individuals
no_PLPs_categorized_data <- categorized_combined_data %>%
filter(PLP_Cardiomyopathy < 1 & probability > 0.83)
# Count the number of rows where arrest_group equals 1 (Controls)
paste("number of PLP negative DISCOVERY controls over the threshold")
## [1] "number of PLP negative DISCOVERY controls over the threshold"
nrow(no_PLPs_categorized_data %>% filter(arrest_group == 1))
## [1] 0
# Count the number of rows where arrest_group equals > 1 (Cases)
paste("number of PLP negative DISCOVERY cases over the threshold")
## [1] "number of PLP negative DISCOVERY cases over the threshold"
nrow(no_PLPs_categorized_data %>% filter(arrest_group > 1))
## [1] 22
#for replication
no_PLPs_categorized_replication_data <- categorized_replication_data %>%
filter(PLP_Cardiomyopathy < 1 & probability > 0.7)
# Count the number of rows where arrest_group equals 1 (Controls)
paste("number of PLP negative REPLICATION controls over the threshold")
## [1] "number of PLP negative REPLICATION controls over the threshold"
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group == 1))
## [1] 0
# Count the number of rows where arrest_group equals > 1 (Cases)
paste("number of PLP negative REPLICATION cases over the threshold")
## [1] "number of PLP negative REPLICATION cases over the threshold"
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group > 1))
## [1] 22
NOW, retrain the model on the validation cohort and apply it to the discovery cohort
model_replication <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null,
data = categorized_replication_data, family = binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model_replication
##
## Call: glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval +
## Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF +
## Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI +
## Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare +
## Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy +
## Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare +
## Cardiomyopathy_null + Epilepsy_null, family = binomial(),
## data = categorized_replication_data)
##
## Coefficients:
## (Intercept) Normalized_Heart_rate
## -10.632909 -0.489645
## Normalized_PR_interval Normalized_QTc
## -0.468604 -0.520174
## Normalized_BP Normalized_QRS
## 0.909731 0.538230
## Normalized_LVEF Normalized_LVESV
## -0.863017 -0.791458
## Normalized_SVI Normalized_Afib
## 0.005339 0.842873
## Normalized_LVEDVI Normalized_SV
## -0.774451 0.390626
## Epilepsy_rare Epilepsy_ultrarare
## -0.127564 -1.765110
## Cardiomyopathy_rare Cardiomyopathy_ultrarare
## -0.560834 1.434416
## PLP_Epilepsy PLP_Cardiomyopathy
## -9.841170 0.909586
## Cardiomyopathy_noncoding_rare Epilepsy_noncoding_rare
## -0.051308 0.361626
## Cardiomyopathy_null Epilepsy_null
## 6.534668 0.974009
##
## Degrees of Freedom: 125 Total (i.e. Null); 104 Residual
## Null Deviance: 174.2
## Residual Deviance: 47.6 AIC: 91.6
scores_replication_model_on_discovery <- predict(model_replication, newdata = categorized_combined_data, type = "link")
# Add scores to the dataframes
categorized_combined_data$scores_replication_model_on_discovery <- scores_replication_model_on_discovery
# z normalize
mean_replication_on_discovery <- mean(categorized_combined_data$scores_replication_model_on_discovery)
sd_replication_on_discovery <- sd(categorized_combined_data$scores_replication_model_on_discovery)
scores_replication_model_on_discovery_plot <- ggplot(categorized_combined_data, aes(x = Category, y = (scores_replication_model_on_discovery - mean_replication_on_discovery)/sd_replication_on_discovery, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
scores_replication_model_on_discovery_plot
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# wilcox.test
wilcox.test_result_replication_on_discovery <- wilcox.test(scores_replication_model_on_discovery ~ Category, data = categorized_combined_data)
# View the results
print(wilcox.test_result_replication_on_discovery)
##
## Wilcoxon rank sum test with continuity correction
##
## data: scores_replication_model_on_discovery by Category
## W = 84150, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
Calculate GAPS on betas trained and applied to replication data (within-replication model)
scores_replication_model_on_replication <- predict(model_replication, newdata = categorized_replication_data, type = "link")
# Add scores to the dataframes
categorized_replication_data$scores_replication_model_on_replication <- scores_replication_model_on_replication
# z normalize
mean_replication_on_replication <- mean(categorized_replication_data$scores_replication_model_on_replication)
sd_replication_on_replication <- sd(categorized_replication_data$scores_replication_model_on_replication)
scores_replication_model_on_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (scores_replication_model_on_replication - mean_replication_on_replication)/sd_replication_on_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
scores_replication_model_on_replication_plot
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# t-test
wilcox.test_replication_on_replication <- wilcox.test(scores_replication_model_on_replication ~ Category, data = categorized_replication_data)
# View the results
print(wilcox.test_replication_on_replication)
##
## Wilcoxon rank sum test with continuity correction
##
## data: scores_replication_model_on_replication by Category
## W = 97, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
Now calculate AUC for the original data using replication-derived betas (reverse validation)
categorized_combined_data <- categorized_combined_data %>%
mutate(
probability_replication_on_discovery = plogis(scores_replication_model_on_discovery),
)
# Convert 'Category' into a binary format: 0 for control (1), 1 for case (2-4)
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)
roc_result_full <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$probability_replication_on_discovery, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_result_full)
## Area under the curve: 0.6564
# Calculate the confidence interval for the AUC
ci_auc <- ci.auc(roc_result_full)
# Print the confidence interval
print(ci_auc)
## 95% CI: 0.6221-0.6906 (DeLong)
Replication model on discovery data Odds Ratio
categorized_combined_data <- categorized_combined_data %>%
mutate(
replication_model_on_discovery_rank = rank(scores_replication_model_on_discovery, ties.method = "first"),
replication_model_on_discovery_score_quintiles = ceiling(replication_model_on_discovery_rank / (n() / 5)),
)
# Apply function to each odds category
replication_model_on_discovery_combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, replication_model_on_discovery_score_quintiles)
plot(log2(replication_model_on_discovery_combined_odds_summary$odds_ratio),
ylim = c(-2,10
),
pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
xaxt = "n", col = "#3C8C78")
lines(1:length(replication_model_on_discovery_combined_odds_summary$odds_ratio), log2(replication_model_on_discovery_combined_odds_summary$odds_ratio), col = "#3C8C78")
# Add error bars for 95% CI - Combined
arrows(x0 = 1:length(replication_model_on_discovery_combined_odds_summary$odds_ratio),
y0 = log2(replication_model_on_discovery_combined_odds_summary$lower_ci),
y1 = log2(replication_model_on_discovery_combined_odds_summary$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#3C8C78")
ANCESTRY
Determine optimal PCs where R2 is not dropping and when RMSE does not rise
# Filter the data frame to include only rows where Category == 1
filtered_categorized_combined_data <- subset(categorized_combined_data, Category == "Control")
# Define the number of PCs to try
num_pcs <- 1:20
# Initialize a list to store model results
cv_results <- data.frame(PCs = num_pcs, R2 = NA, RMSE = NA)
# Perform cross-validation for each number of PCs
for (i in num_pcs) {
formula <- as.formula(paste0("scores ~ ", paste0("PC", 1:i, collapse = " + ")))
model <- train(formula, data = filtered_categorized_combined_data, method = "lm",
trControl = trainControl(method = "cv", number = 10))
# Store R² and RMSE for each model
cv_results$R2[i] <- model$results$Rsquared
cv_results$RMSE[i] <- model$results$RMSE
}
# View the results to choose the optimal number of PCs
print(cv_results)
## PCs R2 RMSE
## 1 1 0.0876854 0.8548791
## 2 2 0.1764262 0.8139189
## 3 3 0.1719754 0.8115201
## 4 4 0.2051458 0.7992826
## 5 5 0.1893706 0.8050483
## 6 6 0.1899893 0.8024347
## 7 7 0.1832296 0.8094571
## 8 8 0.1848796 0.8047957
## 9 9 0.1892157 0.8105372
## 10 10 0.1952686 0.8121673
## 11 11 0.1768508 0.8110399
## 12 12 0.1843039 0.8099173
## 13 13 0.1619256 0.8209299
## 14 14 0.1594518 0.8230262
## 15 15 0.1651181 0.8227623
## 16 16 0.1615417 0.8223863
## 17 17 0.1533038 0.8293044
## 18 18 0.1470195 0.8273820
## 19 19 0.1532944 0.8281546
## 20 20 0.1556271 0.8263788
ggplot(cv_results, aes(x = PCs)) +
geom_line(aes(y = R2), color = "blue") +
geom_line(aes(y = RMSE), color = "red") +
ylab("Model Performance (R² / RMSE)") +
ggtitle("Cross-Validation of PCs") +
theme_minimal()
Regress out the influence of the ancestry PCS
# Run the regression of scores on C1 to C10
ancestry_regression_model <- lm(scores ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
data = filtered_categorized_combined_data)
# Get the coefficients from the regression
ancestry_coefficients <- coef(ancestry_regression_model)
# Calculate the adjusted_scores by removing the effects of C1 to C10
categorized_combined_data$adjusted_scores <- categorized_combined_data$scores -
(ancestry_coefficients["(Intercept)"] +
categorized_combined_data$PC1 * ancestry_coefficients["PC1"] +
categorized_combined_data$PC2 * ancestry_coefficients["PC2"] +
categorized_combined_data$PC3 * ancestry_coefficients["PC3"] +
categorized_combined_data$PC4 * ancestry_coefficients["PC4"] +
categorized_combined_data$PC5 * ancestry_coefficients["PC5"] +
categorized_combined_data$PC6 * ancestry_coefficients["PC6"] +
categorized_combined_data$PC7 * ancestry_coefficients["PC7"] +
categorized_combined_data$PC8 * ancestry_coefficients["PC8"] +
categorized_combined_data$PC9 * ancestry_coefficients["PC9"] +
categorized_combined_data$PC10 * ancestry_coefficients["PC10"])
Get adjusted probabilities
# Calculate the adjusted probabilities by removing the effects of C1 to C10
categorized_combined_data <- categorized_combined_data %>%
mutate(adjusted_probability = plogis(adjusted_scores))
Perform PC clustering
# Extract PCA columns (PC1 to PC20)
pca_columns <- categorized_combined_data[, grep("^PC\\d+$", names(categorized_combined_data))]
# Perform GMM clustering
gmm_result <- Mclust(pca_columns, G = 4)
# Extract the cluster labels from the GMM result
initial_clusters <- gmm_result$classification
# Select the data points that belong to Cluster 1
cluster1_data <- pca_columns[initial_clusters == 1, ]
# Perform GMM sub-clustering on Cluster 1
sub_gmm_result <- Mclust(cluster1_data, G = 2)
# Integrate the sub-clustering results back into your overall clustering
final_clusters <- initial_clusters
# Assign new cluster labels to the points in Cluster 1 based on the sub-clustering
# We use +4 here to differentiate from the initial clusters (assuming there were 4 clusters initially)
final_clusters[initial_clusters == 1] <- sub_gmm_result$classification + 4
# Select the data points that belong to Cluster 2
cluster2_data <- pca_columns[initial_clusters == 2, ]
# Perform GMM sub-clustering on Cluster 2
sub_gmm_result2 <- Mclust(cluster2_data, G = 2)
# Integrate the sub-clustering results back into your overall clustering
final_clusters[initial_clusters == 2] <- sub_gmm_result2$classification + 6
Add cluster assignments to data
# Add the final cluster assignments to the original dataframe
categorized_combined_data$cluster_gmm <- as.factor(final_clusters)
# Visualize the clustering results
ggplot(categorized_combined_data, aes(x = PC1, y = PC2, color = cluster_gmm)) +
geom_point() +
ggtitle("GMM Clustering of PCA Data") +
theme_minimal() +
scale_color_manual(values = rainbow(length(unique(final_clusters)))) +
theme(legend.position = "bottom")
contingency_table <- table(categorized_combined_data$cluster_gmm, categorized_combined_data$Ancestry)
contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("Cluster", "Ancestry", "Count")
# Bar plot to show the makeup of each cluster by ancestry
ggplot(contingency_df, aes(x = Cluster, y = Count, fill = Ancestry)) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Makeup of Each Cluster by Ancestry") +
xlab("Cluster") + ylab("Count") +
theme_minimal() +
theme(legend.position = "bottom")
Plot the PCs, ancestry clusters informed by self-reported identities, and plot adjusted data
# Define the mapping of clusters to ancestry
cluster_to_ancestry <- c("6" = "EUR", "7" = "HIS", "5" = "AFR", "3" = "OTH", "4" = "OTH", "8" = "HIS")
# Add the new column to the dataframe
categorized_combined_data$cluster_gmm_Ancestry <- as.character(categorized_combined_data$cluster_gmm)
# Reassign the clusters based on the mapping
categorized_combined_data$cluster_gmm_Ancestry <- sapply(categorized_combined_data$cluster_gmm, function(x) cluster_to_ancestry[as.character(x)])
# Convert the new column to a factor
categorized_combined_data$cluster_gmm_Ancestry <- factor(categorized_combined_data$cluster_gmm_Ancestry, levels = unique(cluster_to_ancestry))
# Create a summary dataframe with counts
summary_data <- categorized_combined_data %>%
group_by(Category, cluster_gmm_Ancestry) %>%
summarise(count = n()) %>%
ungroup()
## `summarise()` has grouped output by 'Category'. You can override using the
## `.groups` argument.
# Define the palette
palette_colors <- wes_palette("FantasticFox1", length(unique(categorized_combined_data$cluster_gmm_Ancestry)), type = "continuous")
# Create a bar plot with percentages and counts
ancestry_makeup <- ggplot(categorized_combined_data, aes(x = Category, fill = cluster_gmm_Ancestry)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
ggtitle("Makeup of cluster_gmm_Ancestry within Each Category") +
xlab("Category") +
ylab("Percentage") +
theme(legend.position = "bottom") +
scale_fill_manual(values = palette_colors) +
geom_text(data = summary_data, aes(label = count, y = count / sum(count), group = cluster_gmm_Ancestry),
position = position_fill(vjust = 0.5), color = "black") +
theme_cowplot(12)
# Visualize the clustering results
clusters <- ggplot(categorized_combined_data, aes(x = PC1, y = PC2, color = cluster_gmm_Ancestry)) +
geom_point() +
ggtitle("GMM Clustering of PCA Data") +
theme_minimal() +
scale_color_manual(values = palette_colors) +
theme(legend.position = "bottom") +
theme_cowplot(12)
# Generalized Plot for Scores
ancestry_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = scores, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Scores by cluster_gmm_Ancestry and Category",
x = "cluster_gmm_Ancestry",
y = "Scores") +
scale_fill_manual(values = group_colors) +
theme(legend.position = "bottom") +
theme_cowplot(12) +
ylim(-3, 3)
# Generalized Plot for Adjusted Scores
ancestry_adjusted_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = adjusted_scores, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Adjusted Scores by cluster_gmm_Ancestry and Category",
x = "cluster_gmm_Ancestry",
y = "Adjusted Scores") +
scale_fill_manual(values = group_colors) +
theme(legend.position = "bottom") +
theme_cowplot(12) +
ylim(-3, 3)
# Adjusted Scores by Category (not ancestry-specific)
ancestry_adjusted_total_scores <- ggplot(categorized_combined_data, aes(x = Category, y = adjusted_scores, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Adjusted Scores by Category",
x = "Category",
y = "Adjusted Scores") +
scale_fill_manual(values = group_colors) +
theme(legend.position = "bottom") +
theme_cowplot(12) +
ylim(-3, 3)
# Specific Plot for Cardiac Variant Scores
cardiac_variant_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = cardiac_variant_score, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Cardiac Variant Scores by cluster_gmm_Ancestry and Category",
x = "cluster_gmm_Ancestry",
y = "Cardiac Variant Scores") +
scale_fill_manual(values = group_colors) +
theme(legend.position = "bottom") +
theme_cowplot(12) +
ylim(-3, 3)
# Specific Plot for Epilepsy Variant Scores
epilepsy_variant_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = epilepsy_variant_score, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Epilepsy Variant Scores by cluster_gmm_Ancestry and Category",
x = "cluster_gmm_Ancestry",
y = "Epilepsy Variant Scores") +
scale_fill_manual(values = group_colors) +
theme(legend.position = "bottom") +
theme_cowplot(12) +
ylim(-3, 3)
# Specific Plot for Common Variant Scores
common_variant_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = common_variant_score, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Common Variant Scores by cluster_gmm_Ancestry and Category",
x = "cluster_gmm_Ancestry",
y = "Common Variant Scores") +
scale_fill_manual(values = group_colors) +
theme(legend.position = "bottom") +
theme_cowplot(12) +
ylim(-3, 3)
# Specific Plot for Noncoding Variant Scores
noncoding_variant_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = noncoding_variant_score, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Noncoding Variant Scores by cluster_gmm_Ancestry and Category",
x = "cluster_gmm_Ancestry",
y = "Noncoding Variant Scores") +
scale_fill_manual(values = group_colors) +
theme(legend.position = "bottom") +
theme_cowplot(12) +
ylim(-3, 3)
# Print Plots
clusters
ancestry_makeup
suppressWarnings(print(ancestry_scores))
suppressWarnings(print(ancestry_adjusted_scores))
suppressWarnings(print(ancestry_adjusted_total_scores))
suppressWarnings(print(cardiac_variant_scores))
suppressWarnings(print(epilepsy_variant_scores))
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
suppressWarnings(print(common_variant_scores))
suppressWarnings(print(noncoding_variant_scores))
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
# Wilcoxon Test for Adjusted Scores
wilcox.test(adjusted_scores ~ Category, data = categorized_combined_data)
##
## Wilcoxon rank sum test with continuity correction
##
## data: adjusted_scores by Category
## W = 79303, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Wilcoxon Test within each ancestry group (Adjusted Scores)
adjusted_wilcox_results <- categorized_combined_data %>%
group_by(cluster_gmm_Ancestry) %>%
summarise(
p_value = wilcox.test(adjusted_scores ~ Category)$p.value
)
print(adjusted_wilcox_results)
## # A tibble: 4 × 2
## cluster_gmm_Ancestry p_value
## <fct> <dbl>
## 1 EUR 0.00000000433
## 2 HIS 0.0181
## 3 AFR 0.0000162
## 4 OTH 0.00548
# Wilcoxon Test within each ancestry group (Scores)
scores_wilcox_results <- categorized_combined_data %>%
group_by(cluster_gmm_Ancestry) %>%
summarise(
p_value = wilcox.test(scores ~ Category)$p.value
)
print(scores_wilcox_results)
## # A tibble: 4 × 2
## cluster_gmm_Ancestry p_value
## <fct> <dbl>
## 1 EUR 1.75e-12
## 2 HIS 2.43e- 3
## 3 AFR 1.17e- 7
## 4 OTH 1.32e- 3
Interaction model to evaluate ancestry x PGS interactions
# Interaction model: Does ancestry modify the effect of the variables on case/control?
interaction_model <- glm(Category ~ Normalized_LVEF * cluster_gmm_Ancestry +
Normalized_Heart_rate * cluster_gmm_Ancestry +
Normalized_SV * cluster_gmm_Ancestry +
Normalized_LVEDVI * cluster_gmm_Ancestry +
Normalized_Afib * cluster_gmm_Ancestry +
Normalized_PR_interval * cluster_gmm_Ancestry +
Normalized_LVESV * cluster_gmm_Ancestry +
Normalized_SVI * cluster_gmm_Ancestry +
Normalized_BP * cluster_gmm_Ancestry +
Normalized_QTc * cluster_gmm_Ancestry,
data = categorized_combined_data, family = binomial)
summary(interaction_model)
##
## Call:
## glm(formula = Category ~ Normalized_LVEF * cluster_gmm_Ancestry +
## Normalized_Heart_rate * cluster_gmm_Ancestry + Normalized_SV *
## cluster_gmm_Ancestry + Normalized_LVEDVI * cluster_gmm_Ancestry +
## Normalized_Afib * cluster_gmm_Ancestry + Normalized_PR_interval *
## cluster_gmm_Ancestry + Normalized_LVESV * cluster_gmm_Ancestry +
## Normalized_SVI * cluster_gmm_Ancestry + Normalized_BP * cluster_gmm_Ancestry +
## Normalized_QTc * cluster_gmm_Ancestry, family = binomial,
## data = categorized_combined_data)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 0.985996 0.189985 5.190
## Normalized_LVEF -0.056041 0.160970 -0.348
## cluster_gmm_AncestryHIS -2.618199 0.270454 -9.681
## cluster_gmm_AncestryAFR -1.501416 0.291432 -5.152
## cluster_gmm_AncestryOTH 0.085666 0.347552 0.246
## Normalized_Heart_rate -0.148686 0.127560 -1.166
## Normalized_SV -0.129167 0.144625 -0.893
## Normalized_LVEDVI 0.099653 0.166314 0.599
## Normalized_Afib -0.119832 0.128547 -0.932
## Normalized_PR_interval 0.233999 0.117628 1.989
## Normalized_LVESV 0.006579 0.184338 0.036
## Normalized_SVI -0.007414 0.176301 -0.042
## Normalized_BP 0.100302 0.128746 0.779
## Normalized_QTc 0.144560 0.129390 1.117
## Normalized_LVEF:cluster_gmm_AncestryHIS 0.161473 0.251740 0.641
## Normalized_LVEF:cluster_gmm_AncestryAFR -0.057026 0.243657 -0.234
## Normalized_LVEF:cluster_gmm_AncestryOTH 0.149925 0.379303 0.395
## cluster_gmm_AncestryHIS:Normalized_Heart_rate 0.279794 0.205353 1.363
## cluster_gmm_AncestryAFR:Normalized_Heart_rate 0.248188 0.199291 1.245
## cluster_gmm_AncestryOTH:Normalized_Heart_rate 0.021320 0.282468 0.075
## cluster_gmm_AncestryHIS:Normalized_SV 0.018127 0.241476 0.075
## cluster_gmm_AncestryAFR:Normalized_SV -0.005352 0.227930 -0.023
## cluster_gmm_AncestryOTH:Normalized_SV -0.099748 0.367285 -0.272
## cluster_gmm_AncestryHIS:Normalized_LVEDVI -0.059796 0.277594 -0.215
## cluster_gmm_AncestryAFR:Normalized_LVEDVI -0.012314 0.242344 -0.051
## cluster_gmm_AncestryOTH:Normalized_LVEDVI 0.162975 0.409278 0.398
## cluster_gmm_AncestryHIS:Normalized_Afib -0.063225 0.207940 -0.304
## cluster_gmm_AncestryAFR:Normalized_Afib 0.092741 0.189536 0.489
## cluster_gmm_AncestryOTH:Normalized_Afib 0.318674 0.279610 1.140
## cluster_gmm_AncestryHIS:Normalized_PR_interval -0.107243 0.195593 -0.548
## cluster_gmm_AncestryAFR:Normalized_PR_interval -0.006562 0.185089 -0.035
## cluster_gmm_AncestryOTH:Normalized_PR_interval 0.140872 0.354449 0.397
## cluster_gmm_AncestryHIS:Normalized_LVESV 0.611566 0.302932 2.019
## cluster_gmm_AncestryAFR:Normalized_LVESV 0.002846 0.265806 0.011
## cluster_gmm_AncestryOTH:Normalized_LVESV 0.084992 0.343738 0.247
## cluster_gmm_AncestryHIS:Normalized_SVI -0.031809 0.270997 -0.117
## cluster_gmm_AncestryAFR:Normalized_SVI 0.088692 0.237651 0.373
## cluster_gmm_AncestryOTH:Normalized_SVI 0.064705 0.353658 0.183
## cluster_gmm_AncestryHIS:Normalized_BP 0.081335 0.195361 0.416
## cluster_gmm_AncestryAFR:Normalized_BP 0.286601 0.198513 1.444
## cluster_gmm_AncestryOTH:Normalized_BP -0.185752 0.315792 -0.588
## cluster_gmm_AncestryHIS:Normalized_QTc -0.256980 0.198895 -1.292
## cluster_gmm_AncestryAFR:Normalized_QTc -0.261544 0.239132 -1.094
## cluster_gmm_AncestryOTH:Normalized_QTc -0.395822 0.295921 -1.338
## Pr(>|z|)
## (Intercept) 2.10e-07 ***
## Normalized_LVEF 0.7277
## cluster_gmm_AncestryHIS < 2e-16 ***
## cluster_gmm_AncestryAFR 2.58e-07 ***
## cluster_gmm_AncestryOTH 0.8053
## Normalized_Heart_rate 0.2438
## Normalized_SV 0.3718
## Normalized_LVEDVI 0.5490
## Normalized_Afib 0.3512
## Normalized_PR_interval 0.0467 *
## Normalized_LVESV 0.9715
## Normalized_SVI 0.9665
## Normalized_BP 0.4359
## Normalized_QTc 0.2639
## Normalized_LVEF:cluster_gmm_AncestryHIS 0.5212
## Normalized_LVEF:cluster_gmm_AncestryAFR 0.8150
## Normalized_LVEF:cluster_gmm_AncestryOTH 0.6926
## cluster_gmm_AncestryHIS:Normalized_Heart_rate 0.1730
## cluster_gmm_AncestryAFR:Normalized_Heart_rate 0.2130
## cluster_gmm_AncestryOTH:Normalized_Heart_rate 0.9398
## cluster_gmm_AncestryHIS:Normalized_SV 0.9402
## cluster_gmm_AncestryAFR:Normalized_SV 0.9813
## cluster_gmm_AncestryOTH:Normalized_SV 0.7859
## cluster_gmm_AncestryHIS:Normalized_LVEDVI 0.8295
## cluster_gmm_AncestryAFR:Normalized_LVEDVI 0.9595
## cluster_gmm_AncestryOTH:Normalized_LVEDVI 0.6905
## cluster_gmm_AncestryHIS:Normalized_Afib 0.7611
## cluster_gmm_AncestryAFR:Normalized_Afib 0.6246
## cluster_gmm_AncestryOTH:Normalized_Afib 0.2544
## cluster_gmm_AncestryHIS:Normalized_PR_interval 0.5835
## cluster_gmm_AncestryAFR:Normalized_PR_interval 0.9717
## cluster_gmm_AncestryOTH:Normalized_PR_interval 0.6910
## cluster_gmm_AncestryHIS:Normalized_LVESV 0.0435 *
## cluster_gmm_AncestryAFR:Normalized_LVESV 0.9915
## cluster_gmm_AncestryOTH:Normalized_LVESV 0.8047
## cluster_gmm_AncestryHIS:Normalized_SVI 0.9066
## cluster_gmm_AncestryAFR:Normalized_SVI 0.7090
## cluster_gmm_AncestryOTH:Normalized_SVI 0.8548
## cluster_gmm_AncestryHIS:Normalized_BP 0.6772
## cluster_gmm_AncestryAFR:Normalized_BP 0.1488
## cluster_gmm_AncestryOTH:Normalized_BP 0.5564
## cluster_gmm_AncestryHIS:Normalized_QTc 0.1963
## cluster_gmm_AncestryAFR:Normalized_QTc 0.2741
## cluster_gmm_AncestryOTH:Normalized_QTc 0.1810
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1370.0 on 992 degrees of freedom
## Residual deviance: 1091.5 on 949 degrees of freedom
## AIC: 1179.5
##
## Number of Fisher Scoring iterations: 4
Now redo the ancestry corrections by adjusting the variable prior to fully incorporating into GAPS
# List of Scores to Correct
score_list <- c("Normalized_QTc", "Normalized_BP", "Normalized_SVI", "Normalized_LVESV",
"Normalized_PR_interval", "Normalized_Afib", "Normalized_LVEDVI", "Normalized_SV",
"Normalized_Heart_rate", "Normalized_LVEF", "Epilepsy_rare", "Epilepsy_ultrarare",
"Cardiomyopathy_rare", "Cardiomyopathy_ultrarare", "PLP_Epilepsy", "PLP_Cardiomyopathy",
"Cardiomyopathy_noncoding_rare", "Epilepsy_noncoding_rare", "Cardiomyopathy_null", "Epilepsy_null")
# Adjust Each Score in the Training Data
for (score in score_list) {
# Run the regression of the score on PC1 to PC10
ancestry_regression_model <- lm(as.formula(paste(score, "~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10")),
data = filtered_categorized_combined_data)
# Get the coefficients from the regression
ancestry_coefficients <- coef(ancestry_regression_model)
# Adjust the score by removing ancestry effects
categorized_combined_data[[paste0(score, "_ANCESTRY_adjusted")]] <- categorized_combined_data[[score]] -
(ancestry_coefficients["(Intercept)"] +
categorized_combined_data$PC1 * ancestry_coefficients["PC1"] +
categorized_combined_data$PC2 * ancestry_coefficients["PC2"] +
categorized_combined_data$PC3 * ancestry_coefficients["PC3"] +
categorized_combined_data$PC4 * ancestry_coefficients["PC4"] +
categorized_combined_data$PC5 * ancestry_coefficients["PC5"] +
categorized_combined_data$PC6 * ancestry_coefficients["PC6"] +
categorized_combined_data$PC7 * ancestry_coefficients["PC7"] +
categorized_combined_data$PC8 * ancestry_coefficients["PC8"] +
categorized_combined_data$PC9 * ancestry_coefficients["PC9"] +
categorized_combined_data$PC10 * ancestry_coefficients["PC10"])
}
# Fit Logistic Regression Using Adjusted Scores
adjusted_score_list <- paste0(score_list, "_ANCESTRY_adjusted")
model_ANCESTRY_adjusted <- glm(as.formula(paste("Category ~", paste(adjusted_score_list, collapse = " + "))),
data = categorized_combined_data, family = binomial())
# Predict Scores for Training Data
categorized_combined_data$Scores_ANCESTRY_adjusted <- predict(model_ANCESTRY_adjusted, type = "link")
# Plot Adjusted Scores
ancestry_ADJUSTED_pre_by_category <- ggplot(categorized_combined_data, aes(x = Category, y = Scores_ANCESTRY_adjusted, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Adjusted Scores by Category", x = "Category", y = "Adjusted Scores") +
scale_fill_manual(values = group_colors) +
theme(legend.position = "bottom") + theme_cowplot(12) + ylim(-3, 3)
ancestry_ADJUSTED_pre_by_category
## Warning: Removed 35 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
ancestry_ADJUSTED_pre_by_ancestry <- ggplot(categorized_combined_data,
aes(x = cluster_gmm_Ancestry,
y = Scores_ANCESTRY_adjusted,
fill = Category,
pattern = Category)) +
geom_boxplot_pattern(outlier.shape = NA,
notch = TRUE,
pattern_density = 0.1,
pattern_fill = "black",
pattern_spacing = 0.01,
pattern_angle = 90) +
labs(title = "Adjusted Scores by Ancestry",
x = "Ancestry Cluster",
y = "Adjusted Scores") +
scale_fill_manual(values = group_colors) +
scale_pattern_manual(values = c("stripe", "crosshatch", "dot", "none")) +
theme(legend.position = "bottom") +
theme_cowplot(12) +
ylim(-3, 3)
ancestry_ADJUSTED_pre_by_ancestry
## Warning: Removed 35 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Compute ROC and AUC for Training Data
categorized_combined_data <- categorized_combined_data %>%
mutate(adjusted_probability = plogis(Scores_ANCESTRY_adjusted))
roc_result_full_PRE_ADJUSTED <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$adjusted_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste("Training AUC:", auc(roc_result_full_PRE_ADJUSTED)))
## [1] "Training AUC: 0.758878107746088"
# Adjust Each Score in the Replication Data
for (score in score_list) {
# Run ancestry adjustment regression in the training set
ancestry_regression_model <- lm(as.formula(paste(score, "~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10")),
data = filtered_categorized_combined_data)
# Get the ancestry correction coefficients
ancestry_coefficients <- coef(ancestry_regression_model)
# Apply ancestry adjustment to the replication dataset
categorized_replication_data[[paste0(score, "_ANCESTRY_adjusted")]] <- categorized_replication_data[[score]] -
(ancestry_coefficients["(Intercept)"] +
categorized_replication_data$PC1 * ancestry_coefficients["PC1"] +
categorized_replication_data$PC2 * ancestry_coefficients["PC2"] +
categorized_replication_data$PC3 * ancestry_coefficients["PC3"] +
categorized_replication_data$PC4 * ancestry_coefficients["PC4"] +
categorized_replication_data$PC5 * ancestry_coefficients["PC5"] +
categorized_replication_data$PC6 * ancestry_coefficients["PC6"] +
categorized_replication_data$PC7 * ancestry_coefficients["PC7"] +
categorized_replication_data$PC8 * ancestry_coefficients["PC8"] +
categorized_replication_data$PC9 * ancestry_coefficients["PC9"] +
categorized_replication_data$PC10 * ancestry_coefficients["PC10"])
}
# Predict Using the Ancestry-Adjusted Replication Data
adjusted_score_list_replication <- paste0(score_list, "_ANCESTRY_adjusted")
# Predict using the model, ensuring we use ONLY adjusted scores
categorized_replication_data$Scores_ANCESTRY_adjusted_replication <- predict(model_ANCESTRY_adjusted,
newdata = categorized_replication_data[, adjusted_score_list_replication, drop = FALSE],
type = "link")
# Convert to Probabilities
categorized_replication_data <- categorized_replication_data %>%
mutate(adjusted_probability_replication = plogis(Scores_ANCESTRY_adjusted_replication))
# Compute ROC and AUC for Replication Data
roc_result_full_adjusted <- roc(categorized_replication_data$binary_outcome,
categorized_replication_data$adjusted_probability_replication)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste("Replication AUC:", auc(roc_result_full_adjusted)))
## [1] "Replication AUC: 0.874778649127245"
plot(roc_result_full_adjusted, xlim = c(1, 0))
# Wilcoxon Test within each ancestry group (Scores)
scores_wilcox_results <- categorized_combined_data %>%
group_by(cluster_gmm_Ancestry) %>%
summarise(
p_value = wilcox.test(Scores_ANCESTRY_adjusted ~ Category)$p.value
)
print(scores_wilcox_results)
## # A tibble: 4 × 2
## cluster_gmm_Ancestry p_value
## <fct> <dbl>
## 1 EUR 6.46e-16
## 2 HIS 5.57e- 6
## 3 AFR 8.54e- 9
## 4 OTH 1.07e- 4
Now plot the datausing the ancestry corrections by adjusting the variable prior to fully incorporating into GAPS
# List of Common Variant Scores to Correct
common_variant_scores <- c("Normalized_QTc", "Normalized_BP", "Normalized_SVI", "Normalized_LVESV",
"Normalized_PR_interval", "Normalized_Afib", "Normalized_LVEDVI", "Normalized_SV",
"Normalized_Heart_rate", "Normalized_LVEF")
# Adjust Each Score in the Training Data
for (score in common_variant_scores) {
# Run the regression of the score on PC1 to PC10
ancestry_regression_model <- lm(as.formula(paste(score, "~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10")),
data = filtered_categorized_combined_data)
# Get the coefficients from the regression
ancestry_coefficients <- coef(ancestry_regression_model)
# Adjust the score by removing ancestry effects
categorized_combined_data[[paste0(score, "_ANCESTRY_adjusted")]] <- categorized_combined_data[[score]] -
(ancestry_coefficients["(Intercept)"] +
categorized_combined_data$PC1 * ancestry_coefficients["PC1"] +
categorized_combined_data$PC2 * ancestry_coefficients["PC2"] +
categorized_combined_data$PC3 * ancestry_coefficients["PC3"] +
categorized_combined_data$PC4 * ancestry_coefficients["PC4"] +
categorized_combined_data$PC5 * ancestry_coefficients["PC5"] +
categorized_combined_data$PC6 * ancestry_coefficients["PC6"] +
categorized_combined_data$PC7 * ancestry_coefficients["PC7"] +
categorized_combined_data$PC8 * ancestry_coefficients["PC8"] +
categorized_combined_data$PC9 * ancestry_coefficients["PC9"] +
categorized_combined_data$PC10 * ancestry_coefficients["PC10"])
}
# Fit Logistic Regression Using Adjusted Scores
adjusted_common_variant_scores <- paste0(common_variant_scores, "_ANCESTRY_adjusted")
model_common_variant_ADJUSTED_pre <- glm(as.formula(paste("Category ~", paste(adjusted_common_variant_scores, collapse = " + "))),
data = categorized_combined_data, family = binomial())
# Predict Common Variant Score
categorized_combined_data$Common_Variant_Score_ADJUSTED_pre <- predict(model_common_variant_ADJUSTED_pre, type = "link")
# Plot Common Variant Score by Category
common_variant_plot_by_category_ADJUSTED_pre <- ggplot(categorized_combined_data, aes(x = Category, y = Common_Variant_Score_ADJUSTED_pre, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Common Variant Score by Category", x = "Category", y = "Common Variant Score") +
scale_fill_manual(values = group_colors) +
theme(legend.position = "bottom") + theme_cowplot(12) + ylim(-3, 3)
common_variant_plot_by_category_ADJUSTED_pre
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
# Plot Common Variant Score by Ancestry
common_variant_plot_by_ancestry_ADJUSTED_pre <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = Common_Variant_Score_ADJUSTED_pre, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Common Variant Score by Ancestry", x = "Ancestry Cluster", y = "Common Variant Score") +
scale_fill_manual(values = group_colors) +
theme(legend.position = "bottom") + theme_cowplot(12) + ylim(-3, 3)
common_variant_plot_by_ancestry_ADJUSTED_pre
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
# Wilcoxon Test within each ancestry group (Scores)
scores_wilcox_results <- categorized_combined_data %>%
summarise(
p_value = wilcox.test(Common_Variant_Score_ADJUSTED_pre ~ Category)$p.value
)
print(scores_wilcox_results)
## p_value
## 1 5.584507e-20
# Wilcoxon Test within each ancestry group (Scores)
scores_wilcox_results <- categorized_combined_data %>%
group_by(cluster_gmm_Ancestry) %>%
summarise(
p_value = wilcox.test(Common_Variant_Score_ADJUSTED_pre ~ Category)$p.value
)
print(scores_wilcox_results)
## # A tibble: 4 × 2
## cluster_gmm_Ancestry p_value
## <fct> <dbl>
## 1 EUR 0.000000680
## 2 HIS 0.0000165
## 3 AFR 0.000238
## 4 OTH 0.0857
Compute the odds ratios in the highest quintile across ancestry before and after the post-integration adjustments
categorized_combined_data <- categorized_combined_data %>%
mutate(
rank_adjusted = rank(adjusted_scores, ties.method = "first"),
score_adjusted_quintiles = ceiling(rank_adjusted / (n() / 5))
)
#subset by ancestry
AFR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "AFR", ]
HIS <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "HIS", ]
EUR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "EUR", ]
OTH <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "OTH", ]
# Apply the OR funciton you made way earlier
combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
combined_odds_summary_AFR = calculate_odds_ratios(AFR, Category, score_quintiles)
combined_odds_summary_HIS = calculate_odds_ratios(HIS, Category, score_quintiles)
combined_odds_summary_EUR = calculate_odds_ratios(EUR, Category, score_quintiles)
combined_odds_summary_OTH = calculate_odds_ratios(OTH, Category, score_quintiles)
combined_odds_summary
## # A tibble: 5 × 8
## score_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 162 36 0.222 1
## 2 2 138 61 0.442 1.99
## 3 3 116 82 0.707 3.18
## 4 4 89 110 1.24 5.56
## 5 5 32 167 5.22 23.5
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_AFR
## # A tibble: 5 × 8
## score_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 82 21 0.256 1
## 2 2 56 18 0.321 1.26
## 3 3 38 21 0.553 2.16
## 4 4 13 13 1 3.90
## 5 5 4 17 4.25 16.6
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_HIS
## # A tibble: 5 × 8
## score_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 69 10 0.145 1
## 2 2 56 9 0.161 1.11
## 3 3 41 13 0.317 2.19
## 4 4 49 17 0.347 2.39
## 5 5 15 9 0.6 4.14
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_EUR
## # A tibble: 5 × 8
## score_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 8 2 0.25 1
## 2 2 19 24 1.26 5.05
## 3 3 29 39 1.34 5.38
## 4 4 23 69 3 12
## 5 5 11 112 10.2 40.7
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_OTH
## # A tibble: 5 × 8
## score_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 3 3 1 1
## 2 2 7 10 1.43 1.43
## 3 3 8 9 1.12 1.12
## 4 4 4 11 2.75 2.75
## 5 5 2 29 14.5 14.5
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Apply the OR funciton you made way earlier
combined_odds_summary_adjust= calculate_odds_ratios(categorized_combined_data, Category, score_adjusted_quintiles)
combined_odds_summary_AFR_adjust = calculate_odds_ratios(AFR, Category, score_adjusted_quintiles)
combined_odds_summary_HIS_adjust = calculate_odds_ratios(HIS, Category, score_adjusted_quintiles)
combined_odds_summary_EUR_adjust = calculate_odds_ratios(EUR, Category, score_adjusted_quintiles)
combined_odds_summary_OTH_adjust = calculate_odds_ratios(OTH, Category, score_adjusted_quintiles)
combined_odds_summary_adjust
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 134 64 0.478 1
## 2 2 131 68 0.519 1.09
## 3 3 118 80 0.678 1.42
## 4 4 105 94 0.895 1.87
## 5 5 49 150 3.06 6.41
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_AFR_adjust
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 52 16 0.308 1
## 2 2 40 11 0.275 0.894
## 3 3 45 16 0.356 1.16
## 4 4 45 25 0.556 1.81
## 5 5 11 22 2 6.5
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_HIS_adjust
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 56 8 0.143 1
## 2 2 57 11 0.193 1.35
## 3 3 44 13 0.295 2.07
## 4 4 43 17 0.395 2.77
## 5 5 30 9 0.3 2.1
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_EUR_adjust
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 23 30 1.30 1
## 2 2 27 41 1.52 1.16
## 3 3 20 44 2.2 1.69
## 4 4 14 47 3.36 2.57
## 5 5 6 84 14 10.7
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_OTH_adjust
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 3 10 3.33 1
## 2 2 7 5 0.714 0.214
## 3 3 9 7 0.778 0.233
## 4 4 3 5 1.67 0.5
## 5 5 2 35 17.5 5.25
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
Extract the odds ratios by ancestry for plotting
# Extracting quintile 5 data for each summary
quintile_5_data <- data.frame(
group = c("Total","AFR", "HIS", "EUR", "OTH"),
odds_ratio = c(combined_odds_summary$odds_ratio[5],
combined_odds_summary_AFR$odds_ratio[5],
combined_odds_summary_HIS$odds_ratio[5],
combined_odds_summary_EUR$odds_ratio[5],
combined_odds_summary_OTH$odds_ratio[5]),
lower_ci = c(combined_odds_summary$lower_ci[5],
combined_odds_summary_AFR$lower_ci[5],
combined_odds_summary_HIS$lower_ci[5],
combined_odds_summary_EUR$lower_ci[5],
combined_odds_summary_OTH$lower_ci[5]),
upper_ci = c(combined_odds_summary$upper_ci[5],
combined_odds_summary_AFR$upper_ci[5],
combined_odds_summary_HIS$upper_ci[5],
combined_odds_summary_EUR$upper_ci[5],
combined_odds_summary_OTH$upper_ci[5])
)
# Log2 transformation
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)
# Define the y-limits based on the transformed data
ylim_range <- range(-2, 6.5)
# Plotting
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups", xaxt = "n", col = "#3C8474")
axis(1, at = 1:5, labels = quintile_5_data$group)
# Add error bars for 95% CI
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci,
code = 3, angle = 90, length = 0.05, col = "#3C8C78")
Now lets compute the odds ratios by 5th quintile and ancestry corrected post-integration of the variables
# Extract quintile 5 data for each summary
quintile_5_data <- data.frame(
group = c("Total","AFR", "HIS", "EUR", "OTH"),
odds_ratio = c(combined_odds_summary_adjust$odds_ratio[5],
combined_odds_summary_AFR_adjust$odds_ratio[5],
combined_odds_summary_HIS_adjust$odds_ratio[5],
combined_odds_summary_EUR_adjust$odds_ratio[5],
combined_odds_summary_OTH_adjust$odds_ratio[5]),
lower_ci = c(combined_odds_summary_adjust$lower_ci[5],
combined_odds_summary_AFR_adjust$lower_ci[5],
combined_odds_summary_HIS_adjust$lower_ci[5],
combined_odds_summary_EUR_adjust$lower_ci[5],
combined_odds_summary_OTH_adjust$lower_ci[5]),
upper_ci = c(combined_odds_summary_adjust$upper_ci[5],
combined_odds_summary_AFR_adjust$upper_ci[5],
combined_odds_summary_HIS_adjust$upper_ci[5],
combined_odds_summary_EUR_adjust$upper_ci[5],
combined_odds_summary_OTH_adjust$upper_ci[5])
)
# Log2 transformation
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)
# Define the y-limits based on the transformed data
ylim_range <- range(-2, 6.5)
# Plot
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups, adjusted", xaxt = "n", col = "black")
axis(1, at = 1:5, labels = quintile_5_data$group)
# Add error bars for 95% CI
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci,
code = 3, angle = 90, length = 0.05, col = "black")
Now lets compute the odds ratios by 5th quintile and ancestry corrected pre-integration of the variables
categorized_combined_data <- categorized_combined_data %>%
mutate(
rank_adjusted = rank(Scores_ANCESTRY_adjusted, ties.method = "first"),
score_adjusted_quintiles = ceiling(rank_adjusted / (n() / 5))
)
#subset by ancestry
AFR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "AFR", ]
HIS <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "HIS", ]
EUR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "EUR", ]
OTH <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "OTH", ]
# Apply the OR funciton you made way earlier
combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, score_adjusted_quintiles)
combined_odds_summary_AFR = calculate_odds_ratios(AFR, Category, score_adjusted_quintiles)
combined_odds_summary_HIS = calculate_odds_ratios(HIS, Category, score_adjusted_quintiles)
combined_odds_summary_EUR = calculate_odds_ratios(EUR, Category, score_adjusted_quintiles)
combined_odds_summary_OTH = calculate_odds_ratios(OTH, Category, score_adjusted_quintiles)
combined_odds_summary
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 156 42 0.269 1
## 2 2 141 58 0.411 1.53
## 3 3 123 75 0.610 2.26
## 4 4 88 111 1.26 4.69
## 5 5 29 170 5.86 21.8
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_AFR
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 56 11 0.196 1
## 2 2 48 11 0.229 1.17
## 3 3 42 19 0.452 2.30
## 4 4 36 23 0.639 3.25
## 5 5 11 26 2.36 12.0
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_HIS
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 64 6 0.0938 1
## 2 2 66 11 0.167 1.78
## 3 3 54 13 0.241 2.57
## 4 4 38 21 0.553 5.89
## 5 5 8 7 0.875 9.33
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_EUR
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 29 15 0.517 1
## 2 2 22 33 1.5 2.9
## 3 3 19 39 2.05 3.97
## 4 4 13 59 4.54 8.77
## 5 5 7 100 14.3 27.6
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_OTH
## # A tibble: 5 × 8
## score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
## <dbl> <int> <int> <dbl> <dbl>
## 1 1 7 10 1.43 1
## 2 2 5 3 0.6 0.42
## 3 3 8 4 0.5 0.35
## 4 4 1 8 8 5.6
## 5 5 3 37 12.3 8.63
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Extract quintile 5 data for each summary
quintile_5_data <- data.frame(
group = c("Total","AFR", "HIS", "EUR", "OTH"),
odds_ratio = c(combined_odds_summary_adjust$odds_ratio[5],
combined_odds_summary_AFR_adjust$odds_ratio[5],
combined_odds_summary_HIS_adjust$odds_ratio[5],
combined_odds_summary_EUR_adjust$odds_ratio[5],
combined_odds_summary_OTH_adjust$odds_ratio[5]),
lower_ci = c(combined_odds_summary_adjust$lower_ci[5],
combined_odds_summary_AFR_adjust$lower_ci[5],
combined_odds_summary_HIS_adjust$lower_ci[5],
combined_odds_summary_EUR_adjust$lower_ci[5],
combined_odds_summary_OTH_adjust$lower_ci[5]),
upper_ci = c(combined_odds_summary_adjust$upper_ci[5],
combined_odds_summary_AFR_adjust$upper_ci[5],
combined_odds_summary_HIS_adjust$upper_ci[5],
combined_odds_summary_EUR_adjust$upper_ci[5],
combined_odds_summary_OTH_adjust$upper_ci[5])
)
# Log2 transformation
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)
# Define the y-limits based on the transformed data
ylim_range <- range(-2, 6.5)
# Plot
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups, adjusted", xaxt = "n", col = "black")
axis(1, at = 1:5, labels = quintile_5_data$group)
# Add error bars for 95% CI
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci,
code = 3, angle = 90, length = 0.05, col = "black")
Plot the ROC for GAPS on the data adjusted for ancestry post-integration
# Convert 'Category' into a binary format: 0 for control (1), 1 for case (2-4)
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)
roc_result_full_adjusted <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$adjusted_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_result_full_adjusted)
## Area under the curve: 0.7589
Plot the ROC for GAPS on the data adjusted for ancestry post-integration on the replication cohort
# Run the regression of the score on PC1 to PC10
ancestry_regression_model <- lm(scores ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
data = filtered_categorized_combined_data)
ancestry_coefficients <- coef(ancestry_regression_model)
# Calculate the adjusted probabilities by removing the effects of C1 to C10
categorized_replication_data$adjusted_scores_replication <- categorized_replication_data$scores_replication -
(ancestry_coefficients["(Intercept)"] +
categorized_replication_data$PC1 * ancestry_coefficients["PC1"] +
categorized_replication_data$PC2 * ancestry_coefficients["PC2"] +
categorized_replication_data$PC3 * ancestry_coefficients["PC3"] +
categorized_replication_data$PC4 * ancestry_coefficients["PC4"] +
categorized_replication_data$PC5 * ancestry_coefficients["PC5"] +
categorized_replication_data$PC6 * ancestry_coefficients["PC6"] +
categorized_replication_data$PC7 * ancestry_coefficients["PC7"] +
categorized_replication_data$PC8 * ancestry_coefficients["PC8"] +
categorized_replication_data$PC9 * ancestry_coefficients["PC9"] +
categorized_replication_data$PC10 * ancestry_coefficients["PC10"] )
# Calculate the adjusted probabilities by removing the effects of C1 to C10
categorized_replication_data <- categorized_replication_data %>%
mutate(adjusted_probability_replication = plogis(adjusted_scores_replication))
# Compute the ROC curve
roc_result_full_adjusted <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$adjusted_probability_replication)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_result_full_adjusted)
## Area under the curve: 0.7918
# Plot the ROC curve with the x-axis reversed
plot(roc_result_full_adjusted,xlim = c(1, 0))
Perform iterations of 50(5-fold) validation of the discovery cohort and plot the dostribution of outcomes
# number of iterations
n_repeats <- 50
# Store results
auc_results <- data.frame()
for (i in 1:n_repeats) {
set.seed(i)
# cross-validation method
cv_folds <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)
# Train models again
cv_model_cardiomyopathy_rare <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_epilepsy_rare <- suppressWarnings(train(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_noncoding_rare <- suppressWarnings(train(Category ~ Epilepsy_noncoding_rare + Cardiomyopathy_noncoding_rare,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_common <- suppressWarnings(train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_coding_rare <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_epilepsy_and_common <- suppressWarnings(train(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_cardiac_and_common <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_common_and_coding <- suppressWarnings(train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
cv_model_full <- suppressWarnings(
train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF +
Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV +
Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare +
PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare +
Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))
# Store results for this iteration
temp_results <- data.frame(
Model = c("Cardiomyopathy Rare", "Epilepsy Rare", "Noncoding Rare", "Common Traits", "Coding Rare",
"Epilepsy & Common", "Cardiac & Common", "Common & Coding", "Common, Coding & Noncoding"),
AUC = c(
max(cv_model_cardiomyopathy_rare$results$ROC),
max(cv_model_epilepsy_rare$results$ROC),
max(cv_model_noncoding_rare$results$ROC),
max(cv_model_common$results$ROC),
max(cv_model_coding_rare$results$ROC),
max(cv_model_epilepsy_and_common$results$ROC),
max(cv_model_cardiac_and_common$results$ROC),
max(cv_model_common_and_coding$results$ROC),
max(cv_model_full$results$ROC)
),
Iteration = i
)
auc_results <- rbind(auc_results, temp_results)
}
# WA color palette for Millenial enjoyment
AUC_palette <- c(wes_palette("Royal1"), wes_palette("Royal2"))
# Boxplot AUC distribution
ggplot(auc_results, aes(x = Model, y = AUC, fill = Model)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = AUC_palette) + # Apply custom palette
theme_minimal() +
labs(title = "AUC Distributions Across Models", x = "Model", y = "AUC") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme_cowplot()
library(dplyr)
summary_stats <- auc_results %>%
group_by(Model) %>%
summarise(Median_AUC = median(AUC),
IQR_AUC = IQR(AUC))
print(summary_stats)
## # A tibble: 9 × 3
## Model Median_AUC IQR_AUC
## <chr> <dbl> <dbl>
## 1 Cardiac & Common 0.717 0.00474
## 2 Cardiomyopathy Rare 0.683 0.00300
## 3 Coding Rare 0.689 0.00460
## 4 Common & Coding 0.720 0.00697
## 5 Common Traits 0.663 0.00671
## 6 Common, Coding & Noncoding 0.738 0.00610
## 7 Epilepsy & Common 0.684 0.00727
## 8 Epilepsy Rare 0.640 0.00332
## 9 Noncoding Rare 0.581 0.00289
Now down-sample 20 different times and re-run the 5-fold validations so that the ancestries match.
# Initialize a dataframe to store all AUCs
auc_results <- data.frame(
Outer_Fold = integer(),
Inner_Fold = integer(),
AUC = numeric(),
stringsAsFactors = FALSE
)
# Define number of outer folds
outer_folds <- 20
inner_folds <- 5
# Outer loop: 20 iterations
for (i in 1:outer_folds) {
# Create a new randomly downsampled dataset (ensuring balanced case/control per ancestry)
downsampled_data <- bind_rows(
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "EUR", Category == "zCase") %>% slice_sample(n = 90),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "EUR", Category == "Control") %>% slice_sample(n = 90),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "HIS", Category == "zCase") %>% slice_sample(n = 58),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "HIS", Category == "Control") %>% slice_sample(n = 58),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "AFR", Category == "zCase") %>% slice_sample(n = 90),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "AFR", Category == "Control") %>% slice_sample(n = 90),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "OTH", Category == "zCase") %>% slice_sample(n = 24),
categorized_combined_data %>% filter(cluster_gmm_Ancestry == "OTH", Category == "Control") %>% slice_sample(n = 24)
)
# Ensure binary outcome is properly coded
downsampled_data$binary_outcome <- ifelse(downsampled_data$Category == "zCase", 1, 0)
# Create 5 stratified folds **by both ancestry and category**
folds <- downsampled_data %>%
group_by(cluster_gmm_Ancestry, Category) %>%
mutate(Fold = sample(rep(1:inner_folds, length.out = n()))) %>%
ungroup()
# Train and test across the 5 folds within this downsampled dataset
inner_auc_values <- numeric(inner_folds)
for (j in 1:inner_folds) {
# Split into training and testing sets
train_data <- folds %>% filter(Fold != j) %>% dplyr::select(-Fold)
test_data <- folds %>% filter(Fold == j) %>% dplyr::select(-Fold)
# Train model
cv_model <- suppressWarnings(
train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF +
Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV +
Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare +
PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare +
Cardiomyopathy_null + Epilepsy_null,
data = train_data, method = "glm", family = binomial)
)
# Get predicted probabilities for the positive class (zCase)
test_data$predicted_prob <- predict(cv_model, test_data, type = "prob")[, "zCase"]
# Compute ROC curve and AUC
roc_result <- roc(test_data$binary_outcome, test_data$predicted_prob)
inner_auc_values[j] <- auc(roc_result)
# Store the AUC result
auc_results <- rbind(auc_results, data.frame(Outer_Fold = i, Inner_Fold = j, AUC = inner_auc_values[j]))
}
# Store the mean AUC from the 5 inner folds for this outer iteration
auc_results <- rbind(auc_results, data.frame(Outer_Fold = i, Inner_Fold = NA, AUC = mean(inner_auc_values)))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Ensure Inner_Fold is treated as character for filtering
auc_results <- auc_results %>%
mutate(Inner_Fold = as.character(Inner_Fold))
# Extract inner fold AUCs for the histogram
inner_auc_data <- auc_results %>%
filter(Inner_Fold != "Mean") # Exclude previous Mean entries if they exist
# Compute mean AUC per outer fold
outer_auc_means <- inner_auc_data %>%
group_by(Outer_Fold) %>%
summarise(AUC = mean(AUC)) %>%
ungroup()
# Ensure AUC is numeric
inner_auc_data$AUC <- as.numeric(inner_auc_data$AUC)
outer_auc_means$AUC <- as.numeric(outer_auc_means$AUC)
# Print to verify correct means
print(outer_auc_means)
## # A tibble: 20 × 2
## Outer_Fold AUC
## <int> <dbl>
## 1 1 0.668
## 2 2 0.676
## 3 3 0.665
## 4 4 0.676
## 5 5 0.699
## 6 6 0.686
## 7 7 0.678
## 8 8 0.684
## 9 9 0.675
## 10 10 0.682
## 11 11 0.652
## 12 12 0.682
## 13 13 0.673
## 14 14 0.604
## 15 15 0.669
## 16 16 0.672
## 17 17 0.676
## 18 18 0.682
## 19 19 0.681
## 20 20 0.676
Now plot the distribution of all the down-sampled, ancestry-matched AUC results, layering the means of each outer fold on top
# Create a bar plot with percentages and counts
ancestry_makeup <- ggplot(downsampled_data, aes(x = Category, fill = cluster_gmm_Ancestry)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
ggtitle("Makeup of cluster_gmm_Ancestry within Each Category") +
xlab("Category") +
ylab("Percentage") +
theme(legend.position = "bottom") +
scale_fill_manual(values = palette_colors) +
theme_cowplot(12)
ancestry_makeup
# Create histogram with cowplot styling
auc_histogram <- ggplot(inner_auc_data, aes(x = AUC)) +
geom_histogram(binwidth = 0.01, fill = "lightblue", color = "black") +
geom_vline(data = outer_auc_means, aes(xintercept = AUC), color = "black", linetype = "dashed", size = 0.5) +
geom_vline(xintercept = 0.5, color = "red", linetype = "solid", size = 1) +
xlab("AUC") +
ylab("Frequency") +
ggtitle("Histogram of Inner Fold AUCs with Outer Fold Means") +
theme_cowplot(12)
auc_histogram
# Compute mean of outer fold AUCs
mean_outer_auc <- outer_auc_means %>%
summarise(Mean_AUC = mean(AUC)) %>%
pull(Mean_AUC)
# Print the mean AUC value
print(paste("Mean AUC across outer folds:", mean_outer_auc))
## [1] "Mean AUC across outer folds: 0.672768927539058"
# Load the reporter data
reporter_data <- read.delim("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/Reporter_analysis.txt", check.names = FALSE)
reporter_long <- reporter_data %>%
pivot_longer(cols = everything(), names_to = "Condition", values_to = "Activity")
library(ggplot2)
ggplot(reporter_long, aes(x = Condition, y = Activity, fill = Condition)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.6, color = "black") +
theme_minimal(base_size = 14) +
theme(legend.position = "none") +
ylab("Reporter Activity") +
xlab("") +
coord_cartesian(ylim = c(0, max(reporter_long$Activity, na.rm = TRUE) * 1.1)) +
ggtitle("Reporter Assay Activity by Condition")